/* sp_sm2_x86_64_asm.S */
/*
 * Copyright (C) 2006-2025 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#ifdef WOLFSSL_USER_SETTINGS
#ifdef WOLFSSL_USER_SETTINGS_ASM
/*
 * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
 * The script takes in a user_settings.h and produces user_settings_asm.h, which
 * is a stripped down version of user_settings.h containing only preprocessor
 * directives. This makes the header safe to include in assembly (.S) files.
 */
#include "user_settings_asm.h"
#else
/*
 * Note: if user_settings.h contains any C code (e.g. a typedef or function
 * prototype), including it here in an assembly (.S) file will cause an
 * assembler failure. See user_settings_asm.h above.
 */
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#ifndef HAVE_INTEL_AVX2
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */

#ifdef WOLFSSL_SP_X86_64_ASM
#ifdef WOLFSSL_SP_SM2
/* Multiply a and b into r. (r = a * b)
 *
 * Schoolbook 4x4-word multiplication. Each 64x64-bit partial product is
 * added, column by column, into a rotating three-register accumulator
 * (%r8, %r9, %r10). In the per-product comments A[i] is the word at offset
 * 8*i from %rsi and B[j] is the word at offset 8*j from %rcx.
 *
 * The four low result words are staged in a 32-byte stack area and copied to
 * r only after every input word has been read; the four high result words
 * are written directly to r at offsets 32..56.
 * NOTE(review): the staging presumably allows r to share storage with a or
 * b - confirm against the callers.
 *
 * r  Result, eight 64-bit words are written (%rdi).
 * a  First operand, four 64-bit words are read (%rsi).
 * b  Second operand, four 64-bit words are read (%rdx on entry).
 *
 * Registers written: %rax, %rcx, %rdx, %r8, %r9, %r10.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mul_sm2_4
.type	sp_256_mul_sm2_4,@function
.align	16
sp_256_mul_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mul_sm2_4
.p2align	4
_sp_256_mul_sm2_4:
#endif /* __APPLE__ */
        # Keep the b pointer in rcx: mulq overwrites rdx with the high half.
        movq	%rdx, %rcx
        # Reserve 32 bytes for the low four result words.
        subq	$32, %rsp
        # Column 0 -> stack word 0
        # A[0] * B[0]
        movq	(%rcx), %rax
        mulq	(%rsi)
        xorq	%r10, %r10
        movq	%rax, (%rsp)
        movq	%rdx, %r9
        # Column 1 -> stack word 1 (accumulator r9:r10:r8)
        # A[0] * B[1]
        movq	8(%rcx), %rax
        mulq	(%rsi)
        xorq	%r8, %r8
        addq	%rax, %r9
        adcq	%rdx, %r10
        adcq	$0x00, %r8
        # A[1] * B[0]
        movq	(%rcx), %rax
        mulq	8(%rsi)
        addq	%rax, %r9
        adcq	%rdx, %r10
        adcq	$0x00, %r8
        movq	%r9, 8(%rsp)
        # Column 2 -> stack word 2 (accumulator r10:r8:r9)
        # A[0] * B[2]
        movq	16(%rcx), %rax
        mulq	(%rsi)
        xorq	%r9, %r9
        addq	%rax, %r10
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        # A[1] * B[1]
        movq	8(%rcx), %rax
        mulq	8(%rsi)
        addq	%rax, %r10
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        # A[2] * B[0]
        movq	(%rcx), %rax
        mulq	16(%rsi)
        addq	%rax, %r10
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        movq	%r10, 16(%rsp)
        # Column 3 -> stack word 3 (accumulator r8:r9:r10)
        # A[0] * B[3]
        movq	24(%rcx), %rax
        mulq	(%rsi)
        xorq	%r10, %r10
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %r10
        # A[1] * B[2]
        movq	16(%rcx), %rax
        mulq	8(%rsi)
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %r10
        # A[2] * B[1]
        movq	8(%rcx), %rax
        mulq	16(%rsi)
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %r10
        # A[3] * B[0]
        movq	(%rcx), %rax
        mulq	24(%rsi)
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %r10
        movq	%r8, 24(%rsp)
        # Column 4 -> r word 4 (accumulator r9:r10:r8)
        # A[1] * B[3]
        movq	24(%rcx), %rax
        mulq	8(%rsi)
        xorq	%r8, %r8
        addq	%rax, %r9
        adcq	%rdx, %r10
        adcq	$0x00, %r8
        # A[2] * B[2]
        movq	16(%rcx), %rax
        mulq	16(%rsi)
        addq	%rax, %r9
        adcq	%rdx, %r10
        adcq	$0x00, %r8
        # A[3] * B[1]
        movq	8(%rcx), %rax
        mulq	24(%rsi)
        addq	%rax, %r9
        adcq	%rdx, %r10
        adcq	$0x00, %r8
        movq	%r9, 32(%rdi)
        # Column 5 -> r word 5 (accumulator r10:r8:r9)
        # A[2] * B[3]
        movq	24(%rcx), %rax
        mulq	16(%rsi)
        xorq	%r9, %r9
        addq	%rax, %r10
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        # A[3] * B[2]
        movq	16(%rcx), %rax
        mulq	24(%rsi)
        addq	%rax, %r10
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        movq	%r10, 40(%rdi)
        # Columns 6 and 7 -> r words 6 and 7
        # A[3] * B[3]
        movq	24(%rcx), %rax
        mulq	24(%rsi)
        addq	%rax, %r8
        adcq	%rdx, %r9
        movq	%r8, 48(%rdi)
        movq	%r9, 56(%rdi)
        # Copy the staged low four words from the stack into r.
        movq	(%rsp), %rax
        movq	8(%rsp), %rdx
        movq	16(%rsp), %r8
        movq	24(%rsp), %r9
        movq	%rax, (%rdi)
        movq	%rdx, 8(%rdi)
        movq	%r8, 16(%rdi)
        movq	%r9, 24(%rdi)
        addq	$32, %rsp
        repz retq
#ifndef __APPLE__
.size	sp_256_mul_sm2_4,.-sp_256_mul_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Multiply a and b into r. (r = a * b)
 *
 * Row-by-row 4x4-word multiplication using mulx together with two
 * independent carry chains: adcx (carry flag) adds the low halves of the
 * partial products and adox (overflow flag) adds the high halves.
 * In the per-product comments A[i] is the word at offset 8*i from %rsi and
 * B[j] is the word at offset 8*j from %rbp.
 *
 * The eight result words are held in %r8..%r15 and stored to r only after
 * all input words have been read.
 *
 * r   Result of multiplication, eight 64-bit words are written (%rdi).
 * a   First number to multiply, four 64-bit words are read (%rsi).
 * b   Second number to multiply, four 64-bit words are read (%rdx on entry).
 *
 * %rbp, %r12, %r13, %r14, %r15 and %rbx are saved on entry and restored on
 * exit. %rax, %rcx, %rdx and %r8..%r11 are overwritten.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mul_avx2_sm2_4
.type	sp_256_mul_avx2_sm2_4,@function
.align	16
sp_256_mul_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mul_avx2_sm2_4
.p2align	4
_sp_256_mul_avx2_sm2_4:
#endif /* __APPLE__ */
        pushq	%rbp
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # rbp = b pointer; rdx is the implicit mulx multiplicand (a[0] first).
        movq	%rdx, %rbp
        movq	(%rsi), %rdx
        # r14 caches b[1] until r14 becomes a result word in the a[2] row.
        movq	8(%rbp), %r14
        # A[0] * B[0]
        mulxq	(%rbp), %r8, %r9
        # rbx = 0, and the carry and overflow flags are cleared.
        xorq	%rbx, %rbx
        # A[0] * B[1]
        mulxq	%r14, %rax, %r10
        adcxq	%rax, %r9
        # A[0] * B[2]
        mulxq	16(%rbp), %rax, %r11
        adcxq	%rax, %r10
        # A[0] * B[3]
        mulxq	24(%rbp), %rax, %r12
        adcxq	%rax, %r11
        movq	8(%rsi), %rdx
        adcxq	%rbx, %r12
        # A[1] * B[0]
        mulxq	(%rbp), %rax, %rcx
        # Restart both carry chains for the a[1] row.
        xorq	%rbx, %rbx
        adcxq	%rax, %r9
        # A[1] * B[1]
        mulxq	%r14, %rax, %r15
        adoxq	%rcx, %r10
        adcxq	%rax, %r10
        # A[1] * B[2]
        mulxq	16(%rbp), %rax, %rcx
        adoxq	%r15, %r11
        adcxq	%rax, %r11
        # A[1] * B[3]
        mulxq	24(%rbp), %rax, %r13
        adoxq	%rcx, %r12
        adcxq	%rax, %r12
        adoxq	%rbx, %r13
        movq	16(%rsi), %rdx
        adcxq	%rbx, %r13
        # A[2] * B[0]
        mulxq	(%rbp), %rax, %rcx
        # Restart both carry chains for the a[2] row.
        xorq	%rbx, %rbx
        adcxq	%rax, %r10
        # A[2] * B[1]
        mulxq	%r14, %rax, %r15
        adoxq	%rcx, %r11
        adcxq	%rax, %r11
        # A[2] * B[2]
        mulxq	16(%rbp), %rax, %rcx
        adoxq	%r15, %r12
        adcxq	%rax, %r12
        # A[2] * B[3]
        # r14 stops caching b[1] here and becomes result word 6.
        mulxq	24(%rbp), %rax, %r14
        adoxq	%rcx, %r13
        adcxq	%rax, %r13
        adoxq	%rbx, %r14
        movq	24(%rsi), %rdx
        adcxq	%rbx, %r14
        # A[3] * B[0]
        mulxq	(%rbp), %rax, %rcx
        # Restart both carry chains for the a[3] row.
        xorq	%rbx, %rbx
        adcxq	%rax, %r11
        # A[3] * B[1]
        # b[1] is read from memory because r14 no longer caches the word.
        mulxq	8(%rbp), %rax, %r15
        adoxq	%rcx, %r12
        adcxq	%rax, %r12
        # A[3] * B[2]
        mulxq	16(%rbp), %rax, %rcx
        adoxq	%r15, %r13
        adcxq	%rax, %r13
        # A[3] * B[3]
        mulxq	24(%rbp), %rax, %r15
        adoxq	%rcx, %r14
        adcxq	%rax, %r14
        adoxq	%rbx, %r15
        adcxq	%rbx, %r15
        # Store the eight result words.
        movq	%r8, (%rdi)
        movq	%r9, 8(%rdi)
        movq	%r10, 16(%rdi)
        movq	%r11, 24(%rdi)
        movq	%r12, 32(%rdi)
        movq	%r13, 40(%rdi)
        movq	%r14, 48(%rdi)
        movq	%r15, 56(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbp
        repz retq
#ifndef __APPLE__
.size	sp_256_mul_avx2_sm2_4,.-sp_256_mul_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
/* Square a and put result in r. (r = a * a)
 *
 * Schoolbook squaring. Every off-diagonal product A[i] * A[j] (i < j) is
 * computed once and added twice; the diagonal products A[i] * A[i] are added
 * once. Columns are summed into a rotating three-register accumulator
 * (%r8, %r9, %rcx).
 *
 * The four low result words are staged in a 32-byte stack area and copied to
 * r only after every input word has been read; the four high result words
 * are written directly to r at offsets 32..56.
 *
 * r  Result, eight 64-bit words are written (%rdi).
 * a  Number to square, four 64-bit words are read (%rsi).
 *
 * Registers written: %rax, %rcx, %rdx, %r8, %r9, %r10, %r11.
 * No callee-saved register is used, so none is saved.
 */
#ifndef __APPLE__
.text
.globl	sp_256_sqr_sm2_4
.type	sp_256_sqr_sm2_4,@function
.align	16
sp_256_sqr_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_sqr_sm2_4
.p2align	4
_sp_256_sqr_sm2_4:
#endif /* __APPLE__ */
        # Reserve 32 bytes for the low four result words.
        subq	$32, %rsp
        # Column 0 -> stack word 0
        # A[0] * A[0]
        movq	(%rsi), %rax
        mulq	%rax
        xorq	%r9, %r9
        movq	%rax, (%rsp)
        movq	%rdx, %r8
        # Column 1 -> stack word 1 (accumulator r8:r9:rcx)
        # A[0] * A[1], added twice
        movq	8(%rsi), %rax
        mulq	(%rsi)
        xorq	%rcx, %rcx
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %rcx
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %rcx
        movq	%r8, 8(%rsp)
        # Column 2 -> stack word 2 (accumulator r9:rcx:r8)
        # A[0] * A[2], added twice
        movq	16(%rsi), %rax
        mulq	(%rsi)
        xorq	%r8, %r8
        addq	%rax, %r9
        adcq	%rdx, %rcx
        adcq	$0x00, %r8
        addq	%rax, %r9
        adcq	%rdx, %rcx
        adcq	$0x00, %r8
        # A[1] * A[1]
        movq	8(%rsi), %rax
        mulq	%rax
        addq	%rax, %r9
        adcq	%rdx, %rcx
        adcq	$0x00, %r8
        movq	%r9, 16(%rsp)
        # Column 3 -> stack word 3 (accumulator rcx:r8:r9)
        # A[0] * A[3], added twice
        movq	24(%rsi), %rax
        mulq	(%rsi)
        xorq	%r9, %r9
        addq	%rax, %rcx
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        addq	%rax, %rcx
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        # A[1] * A[2], added twice
        movq	16(%rsi), %rax
        mulq	8(%rsi)
        addq	%rax, %rcx
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        addq	%rax, %rcx
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        movq	%rcx, 24(%rsp)
        # Column 4 -> r word 4 (accumulator r8:r9:rcx)
        # A[1] * A[3], added twice
        movq	24(%rsi), %rax
        mulq	8(%rsi)
        xorq	%rcx, %rcx
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %rcx
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %rcx
        # A[2] * A[2]
        movq	16(%rsi), %rax
        mulq	%rax
        addq	%rax, %r8
        adcq	%rdx, %r9
        adcq	$0x00, %rcx
        movq	%r8, 32(%rdi)
        # Column 5 -> r word 5 (accumulator r9:rcx:r8)
        # A[2] * A[3], added twice
        movq	24(%rsi), %rax
        mulq	16(%rsi)
        xorq	%r8, %r8
        addq	%rax, %r9
        adcq	%rdx, %rcx
        adcq	$0x00, %r8
        addq	%rax, %r9
        adcq	%rdx, %rcx
        adcq	$0x00, %r8
        movq	%r9, 40(%rdi)
        # Columns 6 and 7 -> r words 6 and 7
        # A[3] * A[3]
        movq	24(%rsi), %rax
        mulq	%rax
        addq	%rax, %rcx
        adcq	%rdx, %r8
        movq	%rcx, 48(%rdi)
        movq	%r8, 56(%rdi)
        # Copy the staged low four words from the stack into r.
        movq	(%rsp), %rax
        movq	8(%rsp), %rdx
        movq	16(%rsp), %r10
        movq	24(%rsp), %r11
        movq	%rax, (%rdi)
        movq	%rdx, 8(%rdi)
        movq	%r10, 16(%rdi)
        movq	%r11, 24(%rdi)
        addq	$32, %rsp
        repz retq
#ifndef __APPLE__
.size	sp_256_sqr_sm2_4,.-sp_256_sqr_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Square a and put result in r. (r = a * a)
 *
 * Plain (non-modular) squaring using mulx, adcx and adox.
 * Phase 1 sums the six off-diagonal products A[i] * A[j] (i < j) into
 * %r9..%r14 (result words 1..6).
 * Phase 2 doubles that sum with an adcx chain while an adox chain adds the
 * diagonal products A[i] * A[i] in the same pass.
 *
 * The eight result words are held in %r8..%r15 and stored to r only after
 * all input words have been read.
 *
 * r   Result of squaring, eight 64-bit words are written (%rdi).
 * a   Number to square, four 64-bit words are read (%rsi).
 *
 * %r12, %r13, %r14, %r15 and %rbx are saved on entry and restored on exit.
 * %rax, %rcx, %rdx and %r8..%r11 are overwritten.
 */
#ifndef __APPLE__
.text
.globl	sp_256_sqr_avx2_sm2_4
.type	sp_256_sqr_avx2_sm2_4,@function
.align	16
sp_256_sqr_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_sqr_avx2_sm2_4
.p2align	4
_sp_256_sqr_avx2_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # Clear the carry and overflow flags before the first chains.
        xorq	%r8, %r8
        # rdx = a[0], rcx = a[1], rbx = a[2], r15 = a[3]
        movq	(%rsi), %rdx
        movq	8(%rsi), %rcx
        movq	16(%rsi), %rbx
        movq	24(%rsi), %r15
        # A[0] * A[1]
        mulxq	%rcx, %r9, %r10
        # A[0] * A[2]
        mulxq	%rbx, %r8, %r11
        adoxq	%r8, %r10
        # A[0] * A[3]
        mulxq	%r15, %r8, %r12
        movq	%rcx, %rdx
        adoxq	%r8, %r11
        # A[1] * A[2]
        mulxq	%rbx, %r8, %rax
        movq	%r15, %rdx
        adcxq	%r8, %r11
        # A[1] * A[3]
        mulxq	%rcx, %r8, %r13
        # r15 = 0 via mov so that neither pending carry flag is disturbed.
        movq	$0x00, %r15
        adoxq	%rax, %r12
        adcxq	%r8, %r12
        # A[2] * A[3]
        mulxq	%rbx, %r8, %r14
        adoxq	%r15, %r13
        adcxq	%r8, %r13
        adoxq	%r15, %r14
        adcxq	%r15, %r14
        # Double with Carry Flag
        # r15 = 0 and both flags cleared; r15 later receives the final carry.
        xorq	%r15, %r15
        # A[0] * A[0]
        movq	(%rsi), %rdx
        mulxq	%rdx, %r8, %rax
        adcxq	%r9, %r9
        adcxq	%r10, %r10
        adoxq	%rax, %r9
        # A[1] * A[1]
        movq	8(%rsi), %rdx
        mulxq	%rdx, %rcx, %rbx
        adcxq	%r11, %r11
        adoxq	%rcx, %r10
        # A[2] * A[2]
        movq	16(%rsi), %rdx
        mulxq	%rdx, %rax, %rcx
        adcxq	%r12, %r12
        adoxq	%rbx, %r11
        adcxq	%r13, %r13
        adoxq	%rax, %r12
        adcxq	%r14, %r14
        # A[3] * A[3]
        movq	24(%rsi), %rdx
        mulxq	%rdx, %rax, %rbx
        adoxq	%rcx, %r13
        adcxq	%r15, %r15
        adoxq	%rax, %r14
        adoxq	%rbx, %r15
        # Store the eight result words.
        movq	%r8, (%rdi)
        movq	%r9, 8(%rdi)
        movq	%r10, 16(%rdi)
        movq	%r11, 24(%rdi)
        movq	%r12, 32(%rdi)
        movq	%r13, 40(%rdi)
        movq	%r14, 48(%rdi)
        movq	%r15, 56(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_sqr_avx2_sm2_4,.-sp_256_sqr_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
/* Add two four-word numbers. (r = a + b)
 *
 * All input words are read before any result word is written.
 *
 * r  Sum, four 64-bit words are written (%rdi).
 * a  First addend, four 64-bit words (%rsi).
 * b  Second addend, four 64-bit words (%rdx).
 * returns the carry out of the top word in %rax (0 or 1).
 */
#ifndef __APPLE__
.text
.globl	sp_256_add_sm2_4
.type	sp_256_add_sm2_4,@function
.align	16
sp_256_add_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_add_sm2_4
.p2align	4
_sp_256_add_sm2_4:
#endif /* __APPLE__ */
        # Fetch b, then add a from memory (addition is commutative).
        movq	(%rdx), %r8
        movq	8(%rdx), %r9
        movq	16(%rdx), %r10
        movq	24(%rdx), %r11
        # Zero the return register before the carry chain starts.
        xorq	%rax, %rax
        addq	(%rsi), %r8
        adcq	8(%rsi), %r9
        adcq	16(%rsi), %r10
        adcq	24(%rsi), %r11
        # Fold the final carry into rax; the stores below leave flags alone.
        adcq	$0x00, %rax
        movq	%r8, (%rdi)
        movq	%r9, 8(%rdi)
        movq	%r10, 16(%rdi)
        movq	%r11, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_add_sm2_4,.-sp_256_add_sm2_4
#endif /* __APPLE__ */
/* Subtract one four-word number from another. (r = a - b)
 *
 * All input words are read before any result word is written.
 *
 * r  Difference, four 64-bit words are written (%rdi).
 * a  Minuend, four 64-bit words (%rsi).
 * b  Subtrahend, four 64-bit words (%rdx).
 * returns 0 in %rax when no borrow occurred and -1 (all bits set) when the
 * subtraction borrowed out of the top word.
 */
#ifndef __APPLE__
.text
.globl	sp_256_sub_sm2_4
.type	sp_256_sub_sm2_4,@function
.align	16
sp_256_sub_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_sub_sm2_4
.p2align	4
_sp_256_sub_sm2_4:
#endif /* __APPLE__ */
        movq	(%rsi), %r8
        movq	8(%rsi), %r9
        movq	16(%rsi), %r10
        movq	24(%rsi), %r11
        subq	(%rdx), %r8
        sbbq	8(%rdx), %r9
        sbbq	16(%rdx), %r10
        sbbq	24(%rdx), %r11
        # rax - rax - borrow: 0 or -1 whatever rax held before.
        sbbq	%rax, %rax
        movq	%r8, (%rdi)
        movq	%r9, 8(%rdi)
        movq	%r10, 16(%rdi)
        movq	%r11, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_sub_sm2_4,.-sp_256_sub_sm2_4
#endif /* __APPLE__ */
/* Subtract b from a, storing the difference back into a. (a -= b)
 *
 * All four words of b are read before a is modified.
 *
 * a  Four 64-bit words; minuend on entry, difference on exit (%rdi).
 * b  Subtrahend, four 64-bit words (%rsi).
 * returns 0 in %rax when no borrow occurred and -1 (all bits set) when the
 * subtraction borrowed out of the top word.
 */
#ifndef __APPLE__
.text
.globl	sp_256_sub_in_place_sm2_4
.type	sp_256_sub_in_place_sm2_4,@function
.align	16
sp_256_sub_in_place_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_sub_in_place_sm2_4
.p2align	4
_sp_256_sub_in_place_sm2_4:
#endif /* __APPLE__ */
        movq	(%rsi), %r8
        movq	8(%rsi), %r9
        movq	16(%rsi), %r10
        movq	24(%rsi), %r11
        # Read-modify-write each word of a, propagating the borrow.
        subq	%r8, (%rdi)
        sbbq	%r9, 8(%rdi)
        sbbq	%r10, 16(%rdi)
        sbbq	%r11, 24(%rdi)
        # rax - rax - borrow: 0 or -1 whatever rax held before.
        sbbq	%rax, %rax
        repz retq
#ifndef __APPLE__
.size	sp_256_sub_in_place_sm2_4,.-sp_256_sub_in_place_sm2_4
#endif /* __APPLE__ */
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Every word of b is ANDed with m before the subtraction, so the same
 * instruction sequence runs for both mask values. All input words are read
 * before any result word is written.
 *
 * Only volatile registers are used, so nothing is saved on the stack; the
 * argument registers %rcx, %rdx and %rsi are reused as scratch once their
 * incoming values are no longer needed.
 *
 * r  A single precision number representing condition subtract result,
 *    four 64-bit words are written (%rdi).
 * a  A single precision number to subtract from, four 64-bit words (%rsi).
 * b  A single precision number to subtract, four 64-bit words (%rdx).
 * m  Mask value to apply (%rcx).
 * returns 0 in %rax when no borrow occurred and -1 (all bits set) when the
 * subtraction borrowed out of the top word.
 */
#ifndef __APPLE__
.text
.globl	sp_256_cond_sub_sm2_4
.type	sp_256_cond_sub_sm2_4,@function
.align	16
sp_256_cond_sub_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_cond_sub_sm2_4
.p2align	4
_sp_256_cond_sub_sm2_4:
#endif /* __APPLE__ */
        # r8..r11 = b & m
        movq	(%rdx), %r8
        movq	8(%rdx), %r9
        movq	16(%rdx), %r10
        movq	24(%rdx), %r11
        andq	%rcx, %r8
        andq	%rcx, %r9
        andq	%rcx, %r10
        andq	%rcx, %r11
        # rax, rcx, rdx, rsi = a; the a pointer in rsi is consumed last.
        movq	(%rsi), %rax
        movq	8(%rsi), %rcx
        movq	16(%rsi), %rdx
        movq	24(%rsi), %rsi
        subq	%r8, %rax
        sbbq	%r9, %rcx
        sbbq	%r10, %rdx
        sbbq	%r11, %rsi
        # The stores leave the flags alone, so the borrow survives them.
        movq	%rax, (%rdi)
        movq	%rcx, 8(%rdi)
        movq	%rdx, 16(%rdi)
        movq	%rsi, 24(%rdi)
        # rax - rax - borrow: 0 or -1.
        sbbq	%rax, %rax
        repz retq
#ifndef __APPLE__
.size	sp_256_cond_sub_sm2_4,.-sp_256_cond_sub_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Every word of b is ANDed with m before the subtraction, so the same
 * instruction sequence runs for both mask values. All input words are read
 * before any result word is written.
 *
 * Only volatile registers are used, so nothing is saved on the stack; the
 * argument registers %rcx, %rdx and %rsi are reused as scratch once their
 * incoming values are no longer needed.
 *
 * r  A single precision number representing condition subtract result,
 *    four 64-bit words are written (%rdi).
 * a  A single precision number to subtract from, four 64-bit words (%rsi).
 * b  A single precision number to subtract, four 64-bit words (%rdx).
 * m  Mask value to apply (%rcx).
 * returns 0 in %rax when no borrow occurred and -1 (all bits set) when the
 * subtraction borrowed out of the top word.
 */
#ifndef __APPLE__
.text
.globl	sp_256_cond_sub_avx2_sm2_4
.type	sp_256_cond_sub_avx2_sm2_4,@function
.align	16
sp_256_cond_sub_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_cond_sub_avx2_sm2_4
.p2align	4
_sp_256_cond_sub_avx2_sm2_4:
#endif /* __APPLE__ */
        # r8..r11 = b & m
        movq	(%rdx), %r8
        movq	8(%rdx), %r9
        movq	16(%rdx), %r10
        movq	24(%rdx), %r11
        andq	%rcx, %r8
        andq	%rcx, %r9
        andq	%rcx, %r10
        andq	%rcx, %r11
        # rax, rcx, rdx, rsi = a; the a pointer in rsi is consumed last.
        movq	(%rsi), %rax
        movq	8(%rsi), %rcx
        movq	16(%rsi), %rdx
        movq	24(%rsi), %rsi
        subq	%r8, %rax
        sbbq	%r9, %rcx
        sbbq	%r10, %rdx
        sbbq	%r11, %rsi
        # The stores leave the flags alone, so the borrow survives them.
        movq	%rax, (%rdi)
        movq	%rcx, 8(%rdi)
        movq	%rdx, 16(%rdi)
        movq	%rsi, 24(%rdi)
        # rax - rax - borrow: 0 or -1.
        sbbq	%rax, %rax
        repz retq
#ifndef __APPLE__
.size	sp_256_cond_sub_avx2_sm2_4,.-sp_256_cond_sub_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
/* Mul a by digit b into r. (r = a * b)
 *
 * Each word of a is multiplied by the digit and added into a rotating
 * accumulator (%r8, %r9, %r10). Result word i is stored before input word
 * i + 1 is read.
 *
 * r  A single precision integer, five 64-bit words are written (%rdi).
 * a  A single precision integer, four 64-bit words are read (%rsi).
 * b  A single precision digit (%rdx on entry, kept in %rcx).
 *
 * Registers written: %rax, %rcx, %rdx, %r8, %r9, %r10.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mul_d_sm2_4
.type	sp_256_mul_d_sm2_4,@function
.align	16
sp_256_mul_d_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mul_d_sm2_4
.p2align	4
_sp_256_mul_d_sm2_4:
#endif /* __APPLE__ */
        # Keep the digit in rcx: mulq overwrites rdx with the high half.
        movq	%rdx, %rcx
        # A[0] * B
        movq	%rcx, %rax
        xorq	%r10, %r10
        mulq	(%rsi)
        movq	%rax, %r8
        movq	%rdx, %r9
        movq	%r8, (%rdi)
        # A[1] * B
        movq	%rcx, %rax
        xorq	%r8, %r8
        mulq	8(%rsi)
        addq	%rax, %r9
        # The store leaves the flags alone, so the carry chain continues.
        movq	%r9, 8(%rdi)
        adcq	%rdx, %r10
        adcq	$0x00, %r8
        # A[2] * B
        movq	%rcx, %rax
        xorq	%r9, %r9
        mulq	16(%rsi)
        addq	%rax, %r10
        movq	%r10, 16(%rdi)
        adcq	%rdx, %r8
        adcq	$0x00, %r9
        # A[3] * B
        movq	%rcx, %rax
        mulq	24(%rsi)
        addq	%rax, %r8
        adcq	%rdx, %r9
        # Words 3 and 4 of the five-word result.
        movq	%r8, 24(%rdi)
        movq	%r9, 32(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mul_d_sm2_4,.-sp_256_mul_d_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Mul a by digit b into r. (r = a * b)
 *
 * Uses mulx with the digit as the implicit multiplicand in %rdx. The low
 * halves of the partial products are added with the adcx (carry flag) chain
 * and the high halves with the adox (overflow flag) chain. Result word i is
 * stored before input word i + 1 is read.
 *
 * r  A single precision integer, five 64-bit words are written (%rdi).
 * a  A single precision integer, four 64-bit words are read (%rsi).
 * b  A single precision digit (%rdx).
 *
 * Registers written: %rax, %rcx, %r8, %r9, %r10, %r11.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mul_d_avx2_sm2_4
.type	sp_256_mul_d_avx2_sm2_4,@function
.align	16
sp_256_mul_d_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mul_d_avx2_sm2_4
.p2align	4
_sp_256_mul_d_avx2_sm2_4:
#endif /* __APPLE__ */
        # Copy the digit to rax and back again; rdx is left unchanged.
        movq	%rdx, %rax
        # A[0] * B
        movq	%rax, %rdx
        # r11 = 0 for the whole routine; carry and overflow flags cleared.
        xorq	%r11, %r11
        mulxq	(%rsi), %r9, %r10
        movq	%r9, (%rdi)
        # A[1] * B
        mulxq	8(%rsi), %rcx, %r8
        movq	%r11, %r9
        adcxq	%rcx, %r10
        adoxq	%r8, %r9
        movq	%r10, 8(%rdi)
        # A[2] * B
        mulxq	16(%rsi), %rcx, %r8
        movq	%r11, %r10
        adcxq	%rcx, %r9
        adoxq	%r8, %r10
        movq	%r9, 16(%rdi)
        # A[3] * B
        mulxq	24(%rsi), %rcx, %r8
        movq	%r11, %r9
        adcxq	%rcx, %r10
        adoxq	%r8, %r9
        # Fold the last pending carry into the top word.
        adcxq	%r11, %r9
        movq	%r10, 24(%rdi)
        movq	%r9, 32(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mul_d_avx2_sm2_4,.-sp_256_mul_d_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#ifdef _WIN64
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Performs a single unsigned 128-by-64-bit divq.
 *
 * d1   The high order half of the number to divide (%rdi).
 * d0   The low order half of the number to divide (%rsi).
 * div  The divisor (%rdx on entry, moved to %rcx).
 * returns the quotient in %rax; divq leaves the remainder in %rdx.
 */
#ifndef __APPLE__
.text
.globl	div_256_word_asm_4
.type	div_256_word_asm_4,@function
.align	16
div_256_word_asm_4:
#else
.section	__TEXT,__text
.globl	_div_256_word_asm_4
.p2align	4
_div_256_word_asm_4:
#endif /* __APPLE__ */
        # Free rdx first: divq takes the high half of the dividend there.
        movq	%rdx, %rcx
        movq	%rdi, %rdx
        movq	%rsi, %rax
        divq	%rcx
        repz retq
#ifndef __APPLE__
.size	div_256_word_asm_4,.-div_256_word_asm_4
#endif /* __APPLE__ */
#endif /* _WIN64 */
/* Compare a with b in constant time.
 *
 * The words are compared from the most significant (offset 24) down to the
 * least significant (offset 0) with no branches. %rdx is a "still equal"
 * mask that starts as all ones and is cleared at the first differing word;
 * both operands of every later word are ANDed with it, so those words
 * compare as equal and cannot change the result.
 *
 * a  A single precision integer, four 64-bit words (%rdi).
 * b  A single precision integer, four 64-bit words (%rsi).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively. The values produced in %rax are -1, 0 and 1.
 *
 * Registers written: %rax, %rcx, %rdx, %r8, %r9, %r10.
 */
#ifndef __APPLE__
.text
.globl	sp_256_cmp_sm2_4
.type	sp_256_cmp_sm2_4,@function
.align	16
sp_256_cmp_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_cmp_sm2_4
.p2align	4
_sp_256_cmp_sm2_4:
#endif /* __APPLE__ */
        # rcx = 0 (clears the mask), rdx = still-equal mask (all ones),
        # rax = result (starts as -1), r8 = 1 (the greater-than result).
        xorq	%rcx, %rcx
        movq	$-1, %rdx
        movq	$-1, %rax
        movq	$0x01, %r8
        # Word 3 (most significant)
        movq	24(%rdi), %r9
        movq	24(%rsi), %r10
        andq	%rdx, %r9
        andq	%rdx, %r10
        subq	%r10, %r9
        # Above: rax = 1. Below (borrow): rax = rdx. Not equal: clear rdx.
        cmova	%r8, %rax
        cmovc	%rdx, %rax
        cmovnz	%rcx, %rdx
        # Word 2
        movq	16(%rdi), %r9
        movq	16(%rsi), %r10
        andq	%rdx, %r9
        andq	%rdx, %r10
        subq	%r10, %r9
        cmova	%r8, %rax
        cmovc	%rdx, %rax
        cmovnz	%rcx, %rdx
        # Word 1
        movq	8(%rdi), %r9
        movq	8(%rsi), %r10
        andq	%rdx, %r9
        andq	%rdx, %r10
        subq	%r10, %r9
        cmova	%r8, %rax
        cmovc	%rdx, %rax
        cmovnz	%rcx, %rdx
        # Word 0 (least significant)
        movq	(%rdi), %r9
        movq	(%rsi), %r10
        andq	%rdx, %r9
        andq	%rdx, %r10
        subq	%r10, %r9
        cmova	%r8, %rax
        cmovc	%rdx, %rax
        cmovnz	%rcx, %rdx
        # All words equal: rdx is still -1 and rax is still -1, giving 0.
        # Otherwise rdx is 0 and rax keeps the 1 or -1 chosen above.
        xorq	%rdx, %rax
        repz retq
#ifndef __APPLE__
.size	sp_256_cmp_sm2_4,.-sp_256_cmp_sm2_4
#endif /* __APPLE__ */
/* Copy a into r when the mask m is set. (r ^= (r ^ a) & m)
 * m is -1 to copy and 0 when not.
 *
 * The same instruction sequence runs for both mask values: with m = 0 the
 * masked difference is zero and r is rewritten with its own value.
 *
 * r  A single precision number to copy over, four 64-bit words (%rdi).
 * a  A single precision number to copy, four 64-bit words (%rsi).
 * m  Mask value to apply (%rdx).
 *
 * Registers written: %rax, %rcx, %r8, %r9.
 */
#ifndef __APPLE__
.text
.globl	sp_256_cond_copy_sm2_4
.type	sp_256_cond_copy_sm2_4,@function
.align	16
sp_256_cond_copy_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_cond_copy_sm2_4
.p2align	4
_sp_256_cond_copy_sm2_4:
#endif /* __APPLE__ */
        # Difference a ^ r, starting from the words of a.
        movq	(%rsi), %rax
        movq	8(%rsi), %rcx
        movq	16(%rsi), %r8
        movq	24(%rsi), %r9
        xorq	(%rdi), %rax
        xorq	8(%rdi), %rcx
        xorq	16(%rdi), %r8
        xorq	24(%rdi), %r9
        # Keep the difference only when the mask is set.
        andq	%rdx, %rax
        andq	%rdx, %rcx
        andq	%rdx, %r8
        andq	%rdx, %r9
        # Applying the masked difference turns r into a, or leaves r as is.
        xorq	%rax, (%rdi)
        xorq	%rcx, 8(%rdi)
        xorq	%r8, 16(%rdi)
        xorq	%r9, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_cond_copy_sm2_4,.-sp_256_cond_copy_sm2_4
#endif /* __APPLE__ */
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * Step 1 computes the full eight-word product a * b into
 * %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbx (least significant first).
 * Step 2 derives mu from the low four product words using shifts, adds and
 * subtracts only, then adds and subtracts shifted copies of mu to the upper
 * product words while %r9 tracks the net carry out of the top word.
 * Step 3 subtracts the modulus, ANDed with %r9 used as a mask, and stores
 * the four words %r13, %r14, %r15, %rbx to r.
 *
 * The modulus words are hard-coded in step 3 (all ones, 0xffffffff00000000,
 * all ones, 0xfffffffeffffffff). The incoming m and mp arguments are never
 * read: %r8 is overwritten with the b pointer on entry and %rcx is used as
 * scratch.
 * NOTE(review): the shortcut formula for mu and the treatment of %r9 as an
 * all-ones/zero mask rely on properties of the modulus that cannot be
 * verified from this routine alone - confirm against the generator.
 *
 * r   Result of multiplication, four 64-bit words are written (%rdi).
 * a   First number to multiply in Montgomery form (%rsi).
 * b   Second number to multiply in Montgomery form (%rdx on entry).
 * m   Modulus (prime). Not read by this routine.
 * mp  Montgomery multiplier. Not read by this routine.
 *
 * %r12, %r13, %r14, %r15 and %rbx are saved on entry and restored on exit.
 * %rax, %rcx, %rdx, %rsi and %r8..%r11 are overwritten.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_mul_sm2_4
.type	sp_256_mont_mul_sm2_4,@function
.align	16
sp_256_mont_mul_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_mul_sm2_4
.p2align	4
_sp_256_mont_mul_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # Keep the b pointer in r8: mulq overwrites rdx with the high half.
        movq	%rdx, %r8
        #  A[0] * B[0]
        movq	(%r8), %rax
        mulq	(%rsi)
        movq	%rax, %r9
        movq	%rdx, %r10
        #  A[0] * B[1]
        movq	8(%r8), %rax
        mulq	(%rsi)
        xorq	%r11, %r11
        addq	%rax, %r10
        adcq	%rdx, %r11
        #  A[1] * B[0]
        movq	(%r8), %rax
        mulq	8(%rsi)
        xorq	%r12, %r12
        addq	%rax, %r10
        adcq	%rdx, %r11
        adcq	$0x00, %r12
        #  A[0] * B[2]
        movq	16(%r8), %rax
        mulq	(%rsi)
        addq	%rax, %r11
        adcq	%rdx, %r12
        #  A[1] * B[1]
        movq	8(%r8), %rax
        mulq	8(%rsi)
        xorq	%r13, %r13
        addq	%rax, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        #  A[2] * B[0]
        movq	(%r8), %rax
        mulq	16(%rsi)
        addq	%rax, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        #  A[0] * B[3]
        movq	24(%r8), %rax
        mulq	(%rsi)
        xorq	%r14, %r14
        addq	%rax, %r12
        adcq	%rdx, %r13
        adcq	$0x00, %r14
        #  A[1] * B[2]
        movq	16(%r8), %rax
        mulq	8(%rsi)
        addq	%rax, %r12
        adcq	%rdx, %r13
        adcq	$0x00, %r14
        #  A[2] * B[1]
        movq	8(%r8), %rax
        mulq	16(%rsi)
        addq	%rax, %r12
        adcq	%rdx, %r13
        adcq	$0x00, %r14
        #  A[3] * B[0]
        movq	(%r8), %rax
        mulq	24(%rsi)
        addq	%rax, %r12
        adcq	%rdx, %r13
        adcq	$0x00, %r14
        #  A[1] * B[3]
        movq	24(%r8), %rax
        mulq	8(%rsi)
        xorq	%r15, %r15
        addq	%rax, %r13
        adcq	%rdx, %r14
        adcq	$0x00, %r15
        #  A[2] * B[2]
        movq	16(%r8), %rax
        mulq	16(%rsi)
        addq	%rax, %r13
        adcq	%rdx, %r14
        adcq	$0x00, %r15
        #  A[3] * B[1]
        movq	8(%r8), %rax
        mulq	24(%rsi)
        addq	%rax, %r13
        adcq	%rdx, %r14
        adcq	$0x00, %r15
        #  A[2] * B[3]
        movq	24(%r8), %rax
        mulq	16(%rsi)
        xorq	%rbx, %rbx
        addq	%rax, %r14
        adcq	%rdx, %r15
        adcq	$0x00, %rbx
        #  A[3] * B[2]
        movq	16(%r8), %rax
        mulq	24(%rsi)
        addq	%rax, %r14
        adcq	%rdx, %r15
        adcq	$0x00, %rbx
        #  A[3] * B[3]
        movq	24(%r8), %rax
        mulq	24(%rsi)
        addq	%rax, %r15
        adcq	%rdx, %rbx
        # Start Reduction
        # Below, a[] names the product words: a[0..3] = r9, r10, r11, r12 and
        # a[4..7] = r13, r14, r15, rbx. mu is built in rax, rsi, r8, rdx
        # (least significant word first); the pointers in rsi and r8 are
        # no longer needed.
        # mu = a[0..3] + a[0..2] << 64 - a[0..2] << 32 << 64
        #    + a[0..1] << 128 - (a[0..1] * 2) << 32 << 128
        #    + (a[0..0] * 2) << 192 - (a[0..0] * 4) << 32 << 192
        # mu = a[0..3]
        movq	%r12, %rdx
        #   + (a[0..0] * 2) << 192
        addq	%r9, %rdx
        movq	%r11, %r8
        addq	%r9, %rdx
        #   + a[0..1] << 128
        addq	%r9, %r8
        movq	%r10, %rsi
        adcq	%r10, %rdx
        #   + a[0..2] << 64
        addq	%r9, %rsi
        movq	%r9, %rax
        adcq	%r10, %r8
        adcq	%r11, %rdx
        #   a[0..2] << 32
        # rax holds the unshifted a[0], so the shld into r10 still works
        # after r9 has been shifted.
        shlq	$32, %r9
        shldq	$32, %r10, %r11
        shldq	$32, %rax, %r10
        #   - (a[0..1] * 2) << 32 << 128
        subq	%r9, %r8
        sbbq	%r10, %rdx
        subq	%r9, %r8
        sbbq	%r10, %rdx
        #   - a[0..2] << 32 << 64
        subq	%r9, %rsi
        sbbq	%r10, %r8
        sbbq	%r11, %rdx
        #   - (a[0..0] * 4) << 32 << 192
        movq	%r9, %r11
        shlq	$2, %r11
        subq	%r11, %rdx
        # a += (mu << 256) - (mu << 224) - (mu << 96) + (mu << 64) - mu
        # r9 collects the net carry out of the top word: each carry from an
        # addition subtracts one and each borrow from a subtraction adds one.
        #   a += mu << 256
        xorq	%r9, %r9
        addq	%rax, %r13
        adcq	%rsi, %r14
        adcq	%r8, %r15
        adcq	%rdx, %rbx
        sbbq	$0x00, %r9
        #   a += mu << 64
        # Only the two upper words of mu are added, at a[3] and a[4].
        addq	%r8, %r12
        adcq	%rdx, %r13
        adcq	$0x00, %r14
        adcq	$0x00, %r15
        adcq	$0x00, %rbx
        sbbq	$0x00, %r9
        # mu <<= 32
        # The shifted value occupies five words: rax, rsi, r8, rdx, rcx.
        movq	%rdx, %rcx
        shldq	$32, %r8, %rdx
        shldq	$32, %rsi, %r8
        shldq	$32, %rax, %rsi
        shrq	$32, %rcx
        shlq	$32, %rax
        #   a -= (mu << 32) << 64
        # Only the three upper words are subtracted, starting at a[3].
        subq	%r8, %r12
        sbbq	%rdx, %r13
        sbbq	%rcx, %r14
        sbbq	$0x00, %r15
        sbbq	$0x00, %rbx
        adcq	$0x00, %r9
        #   a -= (mu << 32) << 192
        subq	%rax, %r12
        sbbq	%rsi, %r13
        sbbq	%r8, %r14
        sbbq	%rdx, %r15
        sbbq	%rcx, %rbx
        adcq	$0x00, %r9
        # Modulus words 1 and 3; words 0 and 2 are all ones.
        movq	$0xffffffff00000000, %rax
        movq	$0xfffffffeffffffff, %rsi
        # mask m and sub from result if overflow
        # r9 is the mask; m[0] and m[2] are all ones, so m[0] & mask and
        # m[2] & mask are r9 itself.
        #  m[1] & mask
        andq	%r9, %rax
        #  m[3] & mask
        andq	%r9, %rsi
        subq	%r9, %r13
        sbbq	%rax, %r14
        sbbq	%r9, %r15
        sbbq	%rsi, %rbx
        movq	%r13, (%rdi)
        movq	%r14, 8(%rdi)
        movq	%r15, 16(%rdi)
        movq	%rbx, 24(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_mul_sm2_4,.-sp_256_mont_mul_sm2_4
#endif /* __APPLE__ */
/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
 *
 * The modulus is hard-coded in the reduction (shift/add sequence plus the two
 * 64-bit constants near the end), so m and mp are accepted for interface
 * compatibility only: %rdx and %rcx are overwritten before they are read.
 *
 * r   Result of squaring. Four 64-bit words written through %rdi.
 * a   Number to square in Montgomery form. Four 64-bit words read via %rsi.
 * m   Modulus (prime). Not read.
 * mp  Montgomery multiplier. Not read.
 *
 * Saves and restores %r12-%r15 and %rbx. Uses %rax, %rcx, %rdx, %rsi and
 * %r8-%r11 as scratch.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_sqr_sm2_4
.type	sp_256_mont_sqr_sm2_4,@function
.align	16
sp_256_mont_sqr_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_sqr_sm2_4
.p2align	4
_sp_256_mont_sqr_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # Off-diagonal products A[i] * A[j], i < j, summed into r9..r14
        #  A[0] * A[1]
        movq	(%rsi), %rax
        mulq	8(%rsi)
        movq	%rax, %r9
        movq	%rdx, %r10
        #  A[0] * A[2]
        movq	(%rsi), %rax
        mulq	16(%rsi)
        xorq	%r11, %r11
        addq	%rax, %r10
        adcq	%rdx, %r11
        #  A[0] * A[3]
        movq	(%rsi), %rax
        mulq	24(%rsi)
        xorq	%r12, %r12
        addq	%rax, %r11
        adcq	%rdx, %r12
        #  A[1] * A[2]
        movq	8(%rsi), %rax
        mulq	16(%rsi)
        xorq	%r13, %r13
        addq	%rax, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        #  A[1] * A[3]
        movq	8(%rsi), %rax
        mulq	24(%rsi)
        addq	%rax, %r12
        adcq	%rdx, %r13
        #  A[2] * A[3]
        movq	16(%rsi), %rax
        mulq	24(%rsi)
        xorq	%r14, %r14
        addq	%rax, %r13
        adcq	%rdx, %r14
        # Double the off-diagonal sum; r15 receives the bit shifted out
        xorq	%r15, %r15
        addq	%r9, %r9
        adcq	%r10, %r10
        adcq	%r11, %r11
        adcq	%r12, %r12
        adcq	%r13, %r13
        adcq	%r14, %r14
        adcq	$0x00, %r15
        # Add the diagonal squares; rbx carries each high word to the next step
        #  A[0] * A[0]
        movq	(%rsi), %rax
        mulq	%rax
        movq	%rax, %r8
        movq	%rdx, %rbx
        #  A[1] * A[1]
        movq	8(%rsi), %rax
        mulq	%rax
        addq	%rbx, %r9
        adcq	%rax, %r10
        adcq	$0x00, %rdx
        movq	%rdx, %rbx
        #  A[2] * A[2]
        movq	16(%rsi), %rax
        mulq	%rax
        addq	%rbx, %r11
        adcq	%rax, %r12
        adcq	$0x00, %rdx
        movq	%rdx, %rbx
        #  A[3] * A[3]
        movq	24(%rsi), %rax
        mulq	%rax
        addq	%rbx, %r13
        adcq	%rax, %r14
        adcq	%rdx, %r15
        # The 512-bit square is now in r8 (low) .. r15 (high)
        # Start Reduction
        # mu = a[0..3] + a[0..2] << 64 - a[0..2] << 32 << 64
        #    + a[0..1] << 128 - (a[0..1] * 2) << 32 << 128
        #    + (a[0..0] * 2) << 192 - (a[0..0] * 4) << 32 << 192
        # mu is built in rax (word 0), rsi (word 1), rbx (word 2), rdx (word 3)
        # mu = a[0..3]
        movq	%r11, %rdx
        #   + (a[0..0] * 2) << 192
        addq	%r8, %rdx
        movq	%r10, %rbx
        addq	%r8, %rdx
        #   + a[0..1] << 128
        addq	%r8, %rbx
        movq	%r9, %rsi
        adcq	%r9, %rdx
        #   + a[0..2] << 64
        addq	%r8, %rsi
        movq	%r8, %rax
        adcq	%r9, %rbx
        adcq	%r10, %rdx
        #   a[0..2] << 32
        shlq	$32, %r8
        shldq	$32, %r9, %r10
        shldq	$32, %rax, %r9
        #   - (a[0..1] * 2) << 32 << 128
        subq	%r8, %rbx
        sbbq	%r9, %rdx
        subq	%r8, %rbx
        sbbq	%r9, %rdx
        #   - a[0..2] << 32 << 64
        subq	%r8, %rsi
        sbbq	%r9, %rbx
        sbbq	%r10, %rdx
        #   - (a[0..0] * 4) << 32 << 192
        movq	%r8, %r10
        shlq	$2, %r10
        subq	%r10, %rdx
        # a += (mu << 256) - (mu << 224) - (mu << 96) + (mu << 64) - mu
        # r8 tracks the net carry out of the top word: each carry subtracts
        # one (sbbq) and each borrow adds one back (adcq)
        #   a += mu << 256
        xorq	%r8, %r8
        addq	%rax, %r12
        adcq	%rsi, %r13
        adcq	%rbx, %r14
        adcq	%rdx, %r15
        sbbq	$0x00, %r8
        #   a += mu << 64
        addq	%rbx, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        adcq	$0x00, %r14
        adcq	$0x00, %r15
        sbbq	$0x00, %r8
        # mu <<= 32 (rcx takes the 32 bits shifted out of the top word)
        movq	%rdx, %rcx
        shldq	$32, %rbx, %rdx
        shldq	$32, %rsi, %rbx
        shldq	$32, %rax, %rsi
        shrq	$32, %rcx
        shlq	$32, %rax
        #   a -= (mu << 32) << 64
        subq	%rbx, %r11
        sbbq	%rdx, %r12
        sbbq	%rcx, %r13
        sbbq	$0x00, %r14
        sbbq	$0x00, %r15
        adcq	$0x00, %r8
        #   a -= (mu << 32) << 192
        subq	%rax, %r11
        sbbq	%rsi, %r12
        sbbq	%rbx, %r13
        sbbq	%rdx, %r14
        sbbq	%rcx, %r15
        adcq	$0x00, %r8
        movq	$0xffffffff00000000, %rax
        movq	$0xfffffffeffffffff, %rsi
        # mask m and sub from result if overflow
        #  m[0] = m[2] = -1 & mask = mask (r8 itself)
        #  m[1] = 0xffffffff00000000 & mask
        andq	%r8, %rax
        #  m[3] = 0xfffffffeffffffff & mask
        andq	%r8, %rsi
        subq	%r8, %r12
        sbbq	%rax, %r13
        sbbq	%r8, %r14
        sbbq	%rsi, %r15
        # Store the reduced result from r12..r15
        movq	%r12, (%rdi)
        movq	%r13, 8(%rdi)
        movq	%r14, 16(%rdi)
        movq	%r15, 24(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_sqr_sm2_4,.-sp_256_mont_sqr_sm2_4
#endif /* __APPLE__ */
/* Reduce the number back to 256 bits using Montgomery reduction.
 *
 * The modulus is hard-coded through the shift/add sequence and the two
 * 64-bit constants near the end, so m and mp are accepted but never read:
 * %rsi and %rdx are overwritten as scratch before any use.
 *
 * a   A single precision number to reduce in place. Eight 64-bit words are
 *     read from a[0..7]; the four-word result is written to a[0..3].
 * m   The single precision number representing the modulus. Not read.
 * mp  The digit representing the negative inverse of m mod 2^n. Not read.
 *
 * Saves and restores %rbx and %r12-%r15.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_reduce_sm2_4
.type	sp_256_mont_reduce_sm2_4,@function
.align	16
sp_256_mont_reduce_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_reduce_sm2_4
.p2align	4
_sp_256_mont_reduce_sm2_4:
#endif /* __APPLE__ */
        pushq	%rbx
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        # Load all eight words of a into r8 (low) .. r15 (high)
        movq	(%rdi), %r8
        movq	8(%rdi), %r9
        movq	16(%rdi), %r10
        movq	24(%rdi), %r11
        movq	32(%rdi), %r12
        movq	40(%rdi), %r13
        movq	48(%rdi), %r14
        movq	56(%rdi), %r15
        # Start Reduction
        # mu = a[0..3] + a[0..2] << 64 - a[0..2] << 32 << 64
        #    + a[0..1] << 128 - (a[0..1] * 2) << 32 << 128
        #    + (a[0..0] * 2) << 192 - (a[0..0] * 4) << 32 << 192
        # mu is built in rax (word 0), rbx (word 1), rcx (word 2), rdx (word 3)
        # mu = a[0..3]
        movq	%r11, %rdx
        #   + (a[0..0] * 2) << 192
        addq	%r8, %rdx
        movq	%r10, %rcx
        addq	%r8, %rdx
        #   + a[0..1] << 128
        addq	%r8, %rcx
        movq	%r9, %rbx
        adcq	%r9, %rdx
        #   + a[0..2] << 64
        addq	%r8, %rbx
        movq	%r8, %rax
        adcq	%r9, %rcx
        adcq	%r10, %rdx
        #   a[0..2] << 32
        shlq	$32, %r8
        shldq	$32, %r9, %r10
        shldq	$32, %rax, %r9
        #   - (a[0..1] * 2) << 32 << 128
        subq	%r8, %rcx
        sbbq	%r9, %rdx
        subq	%r8, %rcx
        sbbq	%r9, %rdx
        #   - a[0..2] << 32 << 64
        subq	%r8, %rbx
        sbbq	%r9, %rcx
        sbbq	%r10, %rdx
        #   - (a[0..0] * 4) << 32 << 192
        movq	%r8, %r10
        shlq	$2, %r10
        subq	%r10, %rdx
        # a += (mu << 256) - (mu << 224) - (mu << 96) + (mu << 64) - mu
        # r8 tracks the net carry out of the top word: each carry subtracts
        # one (sbbq) and each borrow adds one back (adcq)
        #   a += mu << 256
        xorq	%r8, %r8
        addq	%rax, %r12
        adcq	%rbx, %r13
        adcq	%rcx, %r14
        adcq	%rdx, %r15
        sbbq	$0x00, %r8
        #   a += mu << 64
        addq	%rcx, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        adcq	$0x00, %r14
        adcq	$0x00, %r15
        sbbq	$0x00, %r8
        # mu <<= 32 (rsi takes the 32 bits shifted out of the top word)
        movq	%rdx, %rsi
        shldq	$32, %rcx, %rdx
        shldq	$32, %rbx, %rcx
        shldq	$32, %rax, %rbx
        shrq	$32, %rsi
        shlq	$32, %rax
        #   a -= (mu << 32) << 64
        subq	%rcx, %r11
        sbbq	%rdx, %r12
        sbbq	%rsi, %r13
        sbbq	$0x00, %r14
        sbbq	$0x00, %r15
        adcq	$0x00, %r8
        #   a -= (mu << 32) << 192
        subq	%rax, %r11
        sbbq	%rbx, %r12
        sbbq	%rcx, %r13
        sbbq	%rdx, %r14
        sbbq	%rsi, %r15
        adcq	$0x00, %r8
        movq	$0xffffffff00000000, %rax
        movq	$0xfffffffeffffffff, %rbx
        # mask m and sub from result if overflow
        #  m[0] = m[2] = -1 & mask = mask (r8 itself)
        #  m[1] = 0xffffffff00000000 & mask
        andq	%r8, %rax
        #  m[3] = 0xfffffffeffffffff & mask
        andq	%r8, %rbx
        subq	%r8, %r12
        sbbq	%rax, %r13
        sbbq	%r8, %r14
        sbbq	%rbx, %r15
        # Store the reduced result over the low four words of a
        movq	%r12, (%rdi)
        movq	%r13, 8(%rdi)
        movq	%r14, 16(%rdi)
        movq	%r15, 24(%rdi)
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbx
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_reduce_sm2_4,.-sp_256_mont_reduce_sm2_4
#endif /* __APPLE__ */
/* Reduce the number back to 256 bits using Montgomery reduction.
 *
 * Word-by-word variant: the modulus words are read from m and each step
 * computes mu = a[i] * mp, so no modulus constants are hard-coded here.
 * Going by the name this is presumably the routine used with the curve
 * order - confirm against the callers.
 *
 * a   A single precision number to reduce in place. Eight 64-bit words are
 *     read and updated; the four-word result is written to a[0..3].
 * m   The single precision number representing the modulus (four words).
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * Saves and restores %r12-%r15.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_reduce_order_sm2_4
.type	sp_256_mont_reduce_order_sm2_4,@function
.align	16
sp_256_mont_reduce_order_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_reduce_order_sm2_4
.p2align	4
_sp_256_mont_reduce_order_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        # Keep mp in rcx because mulq overwrites rdx
        movq	%rdx, %rcx
        # i = 0
        # r14 = carry out of the previous iteration, r8 = iterations left,
        # r13 = address of a[i]
        xorq	%r14, %r14
        movq	$4, %r8
        movq	%rdi, %r13
L_mont_loop_4:
        # mu = a[i] * mp
        movq	(%r13), %r12
        imulq	%rcx, %r12
        # a[i+0] += m[0] * mu
        movq	(%rsi), %rax
        movq	8(%rsi), %r10
        mulq	%r12
        movq	(%r13), %r15
        addq	%rax, %r15
        movq	%rdx, %r9
        movq	%r15, (%r13)
        adcq	$0x00, %r9
        # a[i+1] += m[1] * mu
        movq	%r10, %rax
        mulq	%r12
        movq	16(%rsi), %r10
        movq	8(%r13), %r15
        addq	%r9, %rax
        movq	%rdx, %r11
        adcq	$0x00, %r11
        addq	%rax, %r15
        movq	%r15, 8(%r13)
        adcq	$0x00, %r11
        # a[i+2] += m[2] * mu
        movq	%r10, %rax
        mulq	%r12
        movq	24(%rsi), %r10
        movq	16(%r13), %r15
        addq	%r11, %rax
        movq	%rdx, %r9
        adcq	$0x00, %r9
        addq	%rax, %r15
        movq	%r15, 16(%r13)
        adcq	$0x00, %r9
        # a[i+3] += m[3] * mu
        movq	%r10, %rax
        mulq	%r12
        movq	24(%r13), %r15
        addq	%r9, %rax
        # Fold the previous carry (r14) into the high word, then restart r14
        # with a flag-preserving move so the pending carry is not lost
        adcq	%r14, %rdx
        movq	$0x00, %r14
        adcq	$0x00, %r14
        addq	%rax, %r15
        movq	%r15, 24(%r13)
        # a[i+4] += high word + carry; any carry out is kept in r14
        adcq	%rdx, 32(%r13)
        adcq	$0x00, %r14
        # i += 1
        addq	$8, %r13
        decq	%r8
        jnz	L_mont_loop_4
        # Result is in a[4..7]; rax = 0 - r14 becomes the subtraction mask
        xorq	%rax, %rax
        movq	32(%rdi), %rdx
        movq	40(%rdi), %r8
        movq	48(%rdi), %r15
        movq	56(%rdi), %r9
        subq	%r14, %rax
        # Load m and keep it only when the mask is set
        movq	(%rsi), %r10
        movq	8(%rsi), %r11
        movq	16(%rsi), %r12
        movq	24(%rsi), %r13
        andq	%rax, %r10
        andq	%rax, %r11
        andq	%rax, %r12
        andq	%rax, %r13
        # Subtract the masked modulus (branch-free conditional subtract)
        subq	%r10, %rdx
        sbbq	%r11, %r8
        sbbq	%r12, %r15
        sbbq	%r13, %r9
        # Store the result over the low four words of a
        movq	%rdx, (%rdi)
        movq	%r8, 8(%rdi)
        movq	%r15, 16(%rdi)
        movq	%r9, 24(%rdi)
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_reduce_order_sm2_4,.-sp_256_mont_reduce_order_sm2_4
#endif /* __APPLE__ */
/* Modular addition of two Montgomery form numbers (r = a + b % m).
 *
 * The modulus words are embedded as constants; the m argument is not read.
 * The correction is branch-free: the modulus is subtracted under a mask
 * derived from the carry, and once more if that subtraction did not borrow.
 *
 * r   Result of addition (four 64-bit words).
 * a   First number to add in Montgomery form.
 * b   Second number to add in Montgomery form.
 * m   Modulus (prime). Not read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_add_sm2_4
.type	sp_256_mont_add_sm2_4,@function
.align	16
sp_256_mont_add_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_add_sm2_4
.p2align	4
_sp_256_mont_add_sm2_4:
#endif /* __APPLE__ */
        # Modulus words 1 and 3; words 0 and 2 are all ones
        movq	$0xffffffff00000000, %r10
        movq	$0xfffffffeffffffff, %r11
        # Load a into rax (low), rcx, r8, r9 (high)
        movq	(%rsi), %rax
        movq	8(%rsi), %rcx
        movq	16(%rsi), %r8
        movq	24(%rsi), %r9
        # Sum = a + b
        addq	(%rdx), %rax
        adcq	8(%rdx), %rcx
        adcq	16(%rdx), %r8
        adcq	24(%rdx), %r9
        # Mask = all ones when the addition carried out of 256 bits
        sbbq	%rsi, %rsi
        andq	%rsi, %r10
        andq	%rsi, %r11
        # First masked subtraction of the modulus
        subq	%rsi, %rax
        sbbq	%r10, %rcx
        sbbq	%rsi, %r8
        sbbq	%r11, %r9
        # A borrow cancels the mask; without one the modulus is removed again
        adcq	$0x00, %rsi
        andq	%rsi, %r10
        andq	%rsi, %r11
        # Second masked subtraction of the modulus
        subq	%rsi, %rax
        sbbq	%r10, %rcx
        sbbq	%rsi, %r8
        sbbq	%r11, %r9
        # Write the result
        movq	%rax, (%rdi)
        movq	%rcx, 8(%rdi)
        movq	%r8, 16(%rdi)
        movq	%r9, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_add_sm2_4,.-sp_256_mont_add_sm2_4
#endif /* __APPLE__ */
/* Modular doubling of a Montgomery form number (r = a + a % m).
 *
 * The modulus words are embedded as constants; the m argument is not read.
 * The overflow mask is taken from the top bit of a[3], which is exactly the
 * bit that the doubling shifts out of 256 bits.
 *
 * r   Result of doubling (four 64-bit words).
 * a   Number to double in Montgomery form.
 * m   Modulus (prime). Not read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_dbl_sm2_4
.type	sp_256_mont_dbl_sm2_4,@function
.align	16
sp_256_mont_dbl_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_dbl_sm2_4
.p2align	4
_sp_256_mont_dbl_sm2_4:
#endif /* __APPLE__ */
        # Modulus words 1 and 3; words 0 and 2 are all ones
        movq	$0xffffffff00000000, %r9
        movq	$0xfffffffeffffffff, %r10
        # Load a into rdx (low), rax, rcx, r8 (high)
        movq	(%rsi), %rdx
        movq	8(%rsi), %rax
        movq	16(%rsi), %rcx
        movq	24(%rsi), %r8
        # Mask = sign bit of a[3] replicated across the word
        movq	%r8, %r11
        sarq	$63, %r11
        andq	%r11, %r9
        andq	%r11, %r10
        # Sum = a + a
        addq	%rdx, %rdx
        adcq	%rax, %rax
        adcq	%rcx, %rcx
        adcq	%r8, %r8
        # First masked subtraction of the modulus
        subq	%r11, %rdx
        sbbq	%r9, %rax
        sbbq	%r11, %rcx
        sbbq	%r10, %r8
        # A borrow cancels the mask; without one the modulus is removed again
        adcq	$0x00, %r11
        andq	%r11, %r9
        andq	%r11, %r10
        # Second masked subtraction of the modulus
        subq	%r11, %rdx
        sbbq	%r9, %rax
        sbbq	%r11, %rcx
        sbbq	%r10, %r8
        # Write the result
        movq	%rdx, (%rdi)
        movq	%rax, 8(%rdi)
        movq	%rcx, 16(%rdi)
        movq	%r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_dbl_sm2_4,.-sp_256_mont_dbl_sm2_4
#endif /* __APPLE__ */
/* Modular tripling of a Montgomery form number (r = a + a + a % m).
 *
 * Computed as a corrected doubling followed by a corrected addition of a.
 * The modulus words are embedded as constants; the m argument is not read.
 *
 * r   Result of tripling (four 64-bit words).
 * a   Number to triple in Montgomery form.
 * m   Modulus (prime). Not read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_tpl_sm2_4
.type	sp_256_mont_tpl_sm2_4,@function
.align	16
sp_256_mont_tpl_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_tpl_sm2_4
.p2align	4
_sp_256_mont_tpl_sm2_4:
#endif /* __APPLE__ */
        # Modulus words 1 and 3; words 0 and 2 are all ones
        movq	$0xffffffff00000000, %r9
        movq	$0xfffffffeffffffff, %r10
        # Load a into rdx (low), rax, rcx, r8 (high)
        movq	(%rsi), %rdx
        movq	8(%rsi), %rax
        movq	16(%rsi), %rcx
        movq	24(%rsi), %r8
        # Step one: t = a + a
        addq	%rdx, %rdx
        adcq	%rax, %rax
        adcq	%rcx, %rcx
        adcq	%r8, %r8
        # Mask = all ones when the doubling carried out of 256 bits
        sbbq	%r11, %r11
        andq	%r11, %r9
        andq	%r11, %r10
        # First masked subtraction of the modulus
        subq	%r11, %rdx
        sbbq	%r9, %rax
        sbbq	%r11, %rcx
        sbbq	%r10, %r8
        # A borrow cancels the mask; without one the modulus is removed again
        adcq	$0x00, %r11
        andq	%r11, %r9
        andq	%r11, %r10
        # Second masked subtraction of the modulus
        subq	%r11, %rdx
        sbbq	%r9, %rax
        sbbq	%r11, %rcx
        sbbq	%r10, %r8
        # Restore the unmasked modulus words for step two
        movq	$0xffffffff00000000, %r9
        movq	$0xfffffffeffffffff, %r10
        # Step two: t = t + a
        addq	(%rsi), %rdx
        adcq	8(%rsi), %rax
        adcq	16(%rsi), %rcx
        adcq	24(%rsi), %r8
        # Fold the new carry into the mask left over from step one
        sbbq	$0x00, %r11
        andq	%r11, %r9
        andq	%r11, %r10
        # First masked subtraction of the modulus
        subq	%r11, %rdx
        sbbq	%r9, %rax
        sbbq	%r11, %rcx
        sbbq	%r10, %r8
        # A borrow cancels the mask; without one the modulus is removed again
        adcq	$0x00, %r11
        andq	%r11, %r9
        andq	%r11, %r10
        # Second masked subtraction of the modulus
        subq	%r11, %rdx
        sbbq	%r9, %rax
        sbbq	%r11, %rcx
        sbbq	%r10, %r8
        # Write the result
        movq	%rdx, (%rdi)
        movq	%rax, 8(%rdi)
        movq	%rcx, 16(%rdi)
        movq	%r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_tpl_sm2_4,.-sp_256_mont_tpl_sm2_4
#endif /* __APPLE__ */
/* Modular subtraction of two Montgomery form numbers (r = a - b % m).
 *
 * The modulus words are embedded as constants; the m argument is not read.
 * The correction is branch-free: the modulus is added back under a mask
 * derived from the borrow, and once more if that addition did not carry.
 *
 * r   Result of subtraction (four 64-bit words).
 * a   Number to subtract from in Montgomery form.
 * b   Number to subtract with in Montgomery form.
 * m   Modulus (prime). Not read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_sub_sm2_4
.type	sp_256_mont_sub_sm2_4,@function
.align	16
sp_256_mont_sub_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_sub_sm2_4
.p2align	4
_sp_256_mont_sub_sm2_4:
#endif /* __APPLE__ */
        # Modulus words 1 and 3; words 0 and 2 are all ones
        movq	$0xffffffff00000000, %r10
        movq	$0xfffffffeffffffff, %r11
        # Load a into rax (low), rcx, r8, r9 (high)
        movq	(%rsi), %rax
        movq	8(%rsi), %rcx
        movq	16(%rsi), %r8
        movq	24(%rsi), %r9
        # Difference = a - b
        subq	(%rdx), %rax
        sbbq	8(%rdx), %rcx
        sbbq	16(%rdx), %r8
        sbbq	24(%rdx), %r9
        # Mask = all ones when the subtraction borrowed
        sbbq	%rsi, %rsi
        andq	%rsi, %r10
        andq	%rsi, %r11
        # First masked addition of the modulus
        addq	%rsi, %rax
        adcq	%r10, %rcx
        adcq	%rsi, %r8
        adcq	%r11, %r9
        # A carry cancels the mask; without one the modulus is added again
        adcq	$0x00, %rsi
        andq	%rsi, %r10
        andq	%rsi, %r11
        # Second masked addition of the modulus
        addq	%rsi, %rax
        adcq	%r10, %rcx
        adcq	%rsi, %r8
        adcq	%r11, %r9
        # Write the result
        movq	%rax, (%rdi)
        movq	%rcx, 8(%rdi)
        movq	%r8, 16(%rdi)
        movq	%r9, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_sub_sm2_4,.-sp_256_mont_sub_sm2_4
#endif /* __APPLE__ */
/* Halve a number mod the modulus (prime). (r = a / 2 % m)
 *
 * When a is odd the modulus is added first so that the sum is even, then the
 * 257-bit sum is shifted right by one bit. The modulus words are embedded as
 * constants; the m argument is not read.
 *
 * r  Result of division by 2 (four 64-bit words).
 * a  Number to divide.
 * m  Modulus (prime). Not read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_div2_sm2_4
.type	sp_256_mont_div2_sm2_4,@function
.align	16
sp_256_mont_div2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_div2_sm2_4
.p2align	4
_sp_256_mont_div2_sm2_4:
#endif /* __APPLE__ */
        # Mask = all ones when the low bit of a[0] is set
        movq	(%rsi), %rdx
        movq	%rdx, %r11
        andq	$0x01, %r11
        negq	%r11
        # Masked modulus words 1 and 3; words 0 and 2 equal the mask
        movq	$0xffffffff00000000, %r9
        movq	$0xfffffffeffffffff, %r10
        andq	%r11, %r9
        andq	%r11, %r10
        # Load the remaining words of a into rax, rcx, r8 (high)
        movq	8(%rsi), %rax
        movq	16(%rsi), %rcx
        movq	24(%rsi), %r8
        # Sum = a + masked modulus
        addq	%r11, %rdx
        adcq	%r9, %rax
        adcq	%r11, %rcx
        adcq	%r10, %r8
        # Capture bit 256 of the sum; movq keeps the carry flag intact
        movq	$0x00, %r11
        adcq	$0x00, %r11
        # Shift the 257-bit sum right by one
        shrdq	$0x01, %rax, %rdx
        shrdq	$0x01, %rcx, %rax
        shrdq	$0x01, %r8, %rcx
        shrdq	$0x01, %r11, %r8
        # Write the result
        movq	%rdx, (%rdi)
        movq	%rax, 8(%rdi)
        movq	%rcx, 16(%rdi)
        movq	%r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_div2_sm2_4,.-sp_256_mont_div2_sm2_4
#endif /* __APPLE__ */
/* Two Montgomery numbers, subtract double second from first (r = a - 2.b % m).
 *
 * After r is stored, b is also overwritten in place with (b - r) % m, using
 * the original value of b that is reloaded from memory.
 * The modulus words are embedded as constants; the m argument is not read.
 *
 * r   Result of subtraction.
 * a   Number to subtract from in Montgomery form.
 * b   Number to double and subtract with in Montgomery form.
 *     Updated on return to hold b - r.
 * m   Modulus (prime). Not read.
 *
 * Saves and restores %r12-%r15.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_rsb_sub_dbl_sm2_4
.type	sp_256_mont_rsb_sub_dbl_sm2_4,@function
.align	16
sp_256_mont_rsb_sub_dbl_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_rsb_sub_dbl_sm2_4
.p2align	4
_sp_256_mont_rsb_sub_dbl_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        # Load a into rax, rcx, r8, r9 and b into r10, r11, r12, r13
        movq	(%rsi), %rax
        movq	8(%rsi), %rcx
        movq	16(%rsi), %r8
        movq	24(%rsi), %r9
        movq	(%rdx), %r10
        movq	8(%rdx), %r11
        movq	16(%rdx), %r12
        movq	24(%rdx), %r13
        # Step one: t = 2 * b with masked modulus correction
        # r14 and r15 hold modulus words 1 and 3; words 0 and 2 are all ones
        addq	%r10, %r10
        movq	$0xffffffff00000000, %r14
        adcq	%r11, %r11
        movq	$0xfffffffeffffffff, %r15
        adcq	%r12, %r12
        adcq	%r13, %r13
        # rsi = all ones when the doubling carried out of 256 bits
        sbbq	%rsi, %rsi
        andq	%rsi, %r14
        andq	%rsi, %r15
        subq	%rsi, %r10
        sbbq	%r14, %r11
        sbbq	%rsi, %r12
        sbbq	%r15, %r13
        # A borrow cancels the mask; without one the modulus is removed again
        adcq	$0x00, %rsi
        andq	%rsi, %r14
        andq	%rsi, %r15
        subq	%rsi, %r10
        sbbq	%r14, %r11
        sbbq	%rsi, %r12
        sbbq	%r15, %r13
        # Step two: r = a - t with masked modulus correction
        subq	%r10, %rax
        movq	$0xffffffff00000000, %r14
        sbbq	%r11, %rcx
        movq	$0xfffffffeffffffff, %r15
        sbbq	%r12, %r8
        sbbq	%r13, %r9
        # Fold the borrow into the mask left over from step one
        sbbq	$0x00, %rsi
        andq	%rsi, %r14
        andq	%rsi, %r15
        addq	%rsi, %rax
        adcq	%r14, %rcx
        adcq	%rsi, %r8
        adcq	%r15, %r9
        # A carry cancels the mask; without one the modulus is added again
        adcq	$0x00, %rsi
        andq	%rsi, %r14
        andq	%rsi, %r15
        addq	%rsi, %rax
        adcq	%r14, %rcx
        movq	%rax, (%rdi)
        adcq	%rsi, %r8
        movq	%rcx, 8(%rdi)
        adcq	%r15, %r9
        movq	%r8, 16(%rdi)
        movq	%r9, 24(%rdi)
        # Step three: b = b - r with masked modulus correction
        # b is reloaded from memory; r is still held in rax, rcx, r8, r9
        movq	(%rdx), %r10
        movq	8(%rdx), %r11
        movq	16(%rdx), %r12
        movq	24(%rdx), %r13
        subq	%rax, %r10
        movq	$0xffffffff00000000, %r14
        sbbq	%rcx, %r11
        movq	$0xfffffffeffffffff, %r15
        sbbq	%r8, %r12
        sbbq	%r9, %r13
        # rsi = all ones when the subtraction borrowed
        sbbq	%rsi, %rsi
        andq	%rsi, %r14
        andq	%rsi, %r15
        addq	%rsi, %r10
        adcq	%r14, %r11
        adcq	%rsi, %r12
        adcq	%r15, %r13
        # A carry cancels the mask; without one the modulus is added again
        adcq	$0x00, %rsi
        andq	%rsi, %r14
        andq	%rsi, %r15
        addq	%rsi, %r10
        adcq	%r14, %r11
        movq	%r10, (%rdx)
        adcq	%rsi, %r12
        movq	%r11, 8(%rdx)
        adcq	%r15, %r13
        movq	%r12, 16(%rdx)
        movq	%r13, 24(%rdx)
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_rsb_sub_dbl_sm2_4,.-sp_256_mont_rsb_sub_dbl_sm2_4
#endif /* __APPLE__ */
#ifndef WC_NO_CACHE_RESISTANT
/* Touch each possible point that could be being copied.
 *
 * Every one of table entries 1..32 is loaded and combined under a compare
 * mask, so the sequence of memory accesses does not depend on idx. Entry 0 is
 * skipped; an idx that matches no entry in 1..32 produces all-zero output.
 *
 * Entries are 0xc8 (200) bytes apart. From each entry, and into r, only the
 * 32 bytes at offsets 0, 64 and 128 are copied; other bytes of r are left
 * untouched.
 *
 * r      Point to copy into.
 * table  Table - start of the entries to access
 * idx    Index of point to retrieve.
 *
 * Uses %xmm0-%xmm15 and %rax as scratch.
 */
#ifndef __APPLE__
.text
.globl	sp_256_get_point_33_sm2_4
.type	sp_256_get_point_33_sm2_4,@function
.align	16
sp_256_get_point_33_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_get_point_33_sm2_4
.p2align	4
_sp_256_get_point_33_sm2_4:
#endif /* __APPLE__ */
        # xmm13 = idx in every lane, xmm15 = 1 in every lane
        movq	$0x01, %rax
        movd	%edx, %xmm13
        # Skip table entry 0
        addq	$0xc8, %rsi
        movd	%eax, %xmm15
        # rax = number of entries to scan
        movq	$32, %rax
        pshufd	$0x00, %xmm15, %xmm15
        pshufd	$0x00, %xmm13, %xmm13
        # Clear the six accumulators
        pxor	%xmm0, %xmm0
        pxor	%xmm1, %xmm1
        pxor	%xmm2, %xmm2
        pxor	%xmm3, %xmm3
        pxor	%xmm4, %xmm4
        pxor	%xmm5, %xmm5
        # xmm14 = index of the current entry, starting at 1
        movdqa	%xmm15, %xmm14
L_256_get_point_33_sm2_4_start_1:
        # xmm12 = all ones when the current index equals idx, otherwise zero
        movdqa	%xmm14, %xmm12
        paddd	%xmm15, %xmm14
        pcmpeqd	%xmm13, %xmm12
        # Load the three 32-byte fields of the current entry
        movdqu	(%rsi), %xmm6
        movdqu	16(%rsi), %xmm7
        movdqu	64(%rsi), %xmm8
        movdqu	80(%rsi), %xmm9
        movdqu	128(%rsi), %xmm10
        movdqu	144(%rsi), %xmm11
        addq	$0xc8, %rsi
        # Keep the loaded data only for the selected entry
        pand	%xmm12, %xmm6
        pand	%xmm12, %xmm7
        pand	%xmm12, %xmm8
        pand	%xmm12, %xmm9
        pand	%xmm12, %xmm10
        pand	%xmm12, %xmm11
        por	%xmm6, %xmm0
        por	%xmm7, %xmm1
        por	%xmm8, %xmm2
        por	%xmm9, %xmm3
        por	%xmm10, %xmm4
        por	%xmm11, %xmm5
        decq	%rax
        jnz	L_256_get_point_33_sm2_4_start_1
        # Store the selected fields into r
        movdqu	%xmm0, (%rdi)
        movdqu	%xmm1, 16(%rdi)
        movdqu	%xmm2, 64(%rdi)
        movdqu	%xmm3, 80(%rdi)
        movdqu	%xmm4, 128(%rdi)
        movdqu	%xmm5, 144(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_get_point_33_sm2_4,.-sp_256_get_point_33_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Touch each possible point that could be being copied.
 *
 * Every one of table entries 1..32 is loaded and combined under a compare
 * mask, so the sequence of memory accesses does not depend on idx. Entry 0 is
 * skipped; an idx that matches no entry in 1..32 produces all-zero output.
 *
 * Entries are 0xc8 (200) bytes apart. From each entry, and into r, only the
 * 32 bytes at offsets 0, 64 and 128 are copied; other bytes of r are left
 * untouched.
 *
 * Only VEX-encoded instructions are used and the upper halves of the YMM
 * registers are cleared before returning, so callers running legacy SSE code
 * do not hit an AVX/SSE state transition.
 *
 * r      Point to copy into.
 * table  Table - start of the entries to access
 * idx    Index of point to retrieve.
 *
 * Uses %ymm0-%ymm9 and %rax as scratch.
 */
#ifndef __APPLE__
.text
.globl	sp_256_get_point_33_avx2_sm2_4
.type	sp_256_get_point_33_avx2_sm2_4,@function
.align	16
sp_256_get_point_33_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_get_point_33_avx2_sm2_4
.p2align	4
_sp_256_get_point_33_avx2_sm2_4:
#endif /* __APPLE__ */
        movq	$0x01, %rax
        vmovd	%edx, %xmm7
        # Skip table entry 0
        addq	$0xc8, %rsi
        vmovd	%eax, %xmm9
        # rax = number of entries to scan
        movq	$32, %rax
        # Broadcast lane 0 using an all-zero permute index:
        # ymm7 = idx in every lane, ymm9 = 1 in every lane
        vpxor	%ymm8, %ymm8, %ymm8
        vpermd	%ymm7, %ymm8, %ymm7
        vpermd	%ymm9, %ymm8, %ymm9
        # Clear the three accumulators
        vpxor	%ymm0, %ymm0, %ymm0
        vpxor	%ymm1, %ymm1, %ymm1
        vpxor	%ymm2, %ymm2, %ymm2
        # ymm8 = index of the current entry, starting at 1
        vmovdqa	%ymm9, %ymm8
L_256_get_point_33_avx2_sm2_4_start:
        # ymm6 = all ones when the current index equals idx, otherwise zero
        vpcmpeqd	%ymm7, %ymm8, %ymm6
        vpaddd	%ymm9, %ymm8, %ymm8
        # Load the three 32-byte fields of the current entry
        vmovupd	(%rsi), %ymm3
        vmovupd	64(%rsi), %ymm4
        vmovupd	128(%rsi), %ymm5
        addq	$0xc8, %rsi
        # Keep the loaded data only for the selected entry
        vpand	%ymm6, %ymm3, %ymm3
        vpand	%ymm6, %ymm4, %ymm4
        vpand	%ymm6, %ymm5, %ymm5
        vpor	%ymm3, %ymm0, %ymm0
        vpor	%ymm4, %ymm1, %ymm1
        vpor	%ymm5, %ymm2, %ymm2
        decq	%rax
        jnz	L_256_get_point_33_avx2_sm2_4_start
        # Store the selected fields into r
        vmovupd	%ymm0, (%rdi)
        vmovupd	%ymm1, 64(%rdi)
        vmovupd	%ymm2, 128(%rdi)
        # Clear the upper YMM halves before returning to the caller
        vzeroupper
        repz retq
#ifndef __APPLE__
.size	sp_256_get_point_33_avx2_sm2_4,.-sp_256_get_point_33_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* !WC_NO_CACHE_RESISTANT */
#ifdef HAVE_INTEL_AVX2
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * Uses mulx with the adcx/adox dual carry chains for the 4x4 word product.
 * The modulus is hard-coded in the reduction, so m and mp are accepted but
 * never read: %rcx and %r8 are overwritten before any use.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime). Not read.
 * mp  Montgomery multiplier. Not read.
 *
 * Saves and restores %rbp, %r12-%r15 and %rbx.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_mul_avx2_sm2_4
.type	sp_256_mont_mul_avx2_sm2_4,@function
.align	16
sp_256_mont_mul_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_mul_avx2_sm2_4
.p2align	4
_sp_256_mont_mul_avx2_sm2_4:
#endif /* __APPLE__ */
        pushq	%rbp
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # b moves to rbp because mulx takes one multiplicand from rdx
        movq	%rdx, %rbp
        movq	(%rsi), %rdx
        # B[1] is cached in r14 until r14 is needed for the product
        movq	8(%rbp), %r14
        # A[0] * B[0]
        mulxq	(%rbp), %r8, %r9
        # Zero rbx and clear CF and OF before the carry chains start
        xorq	%rbx, %rbx
        # A[0] * B[1]
        mulxq	%r14, %rax, %r10
        adcxq	%rax, %r9
        # A[0] * B[2]
        mulxq	16(%rbp), %rax, %r11
        adcxq	%rax, %r10
        # A[0] * B[3]
        mulxq	24(%rbp), %rax, %r12
        adcxq	%rax, %r11
        movq	8(%rsi), %rdx
        adcxq	%rbx, %r12
        # Rows below: low words are added on the CF chain (adcx) and high
        # words on the OF chain (adox); rbx stays zero to flush both chains
        # A[1] * B[0]
        mulxq	(%rbp), %rax, %rcx
        xorq	%rbx, %rbx
        adcxq	%rax, %r9
        # A[1] * B[1]
        mulxq	%r14, %rax, %r15
        adoxq	%rcx, %r10
        adcxq	%rax, %r10
        # A[1] * B[2]
        mulxq	16(%rbp), %rax, %rcx
        adoxq	%r15, %r11
        adcxq	%rax, %r11
        # A[1] * B[3]
        mulxq	24(%rbp), %rax, %r13
        adoxq	%rcx, %r12
        adcxq	%rax, %r12
        adoxq	%rbx, %r13
        movq	16(%rsi), %rdx
        adcxq	%rbx, %r13
        # A[2] * B[0]
        mulxq	(%rbp), %rax, %rcx
        xorq	%rbx, %rbx
        adcxq	%rax, %r10
        # A[2] * B[1]
        mulxq	%r14, %rax, %r15
        adoxq	%rcx, %r11
        adcxq	%rax, %r11
        # A[2] * B[2]
        mulxq	16(%rbp), %rax, %rcx
        adoxq	%r15, %r12
        adcxq	%rax, %r12
        # A[2] * B[3]
        # r14 is overwritten here, so B[1] is read from memory afterwards
        mulxq	24(%rbp), %rax, %r14
        adoxq	%rcx, %r13
        adcxq	%rax, %r13
        adoxq	%rbx, %r14
        movq	24(%rsi), %rdx
        adcxq	%rbx, %r14
        # A[3] * B[0]
        mulxq	(%rbp), %rax, %rcx
        xorq	%rbx, %rbx
        adcxq	%rax, %r11
        # A[3] * B[1]
        mulxq	8(%rbp), %rax, %r15
        adoxq	%rcx, %r12
        adcxq	%rax, %r12
        # A[3] * B[2]
        mulxq	16(%rbp), %rax, %rcx
        adoxq	%r15, %r13
        adcxq	%rax, %r13
        # A[3] * B[3]
        mulxq	24(%rbp), %rax, %r15
        adoxq	%rcx, %r14
        adcxq	%rax, %r14
        adoxq	%rbx, %r15
        adcxq	%rbx, %r15
        # The 512-bit product is now in r8 (low) .. r15 (high)
        # Start Reduction
        # mu = a[0..3] + a[0..2] << 64 - a[0..2] << 32 << 64
        #    + a[0..1] << 128 - (a[0..1] * 2) << 32 << 128
        #    + (a[0..0] * 2) << 192 - (a[0..0] * 4) << 32 << 192
        # mu is built in rax (word 0), rsi (word 1), rbp (word 2), rdx (word 3)
        # mu = a[0..3]
        movq	%r11, %rdx
        #   + (a[0..0] * 2) << 192
        addq	%r8, %rdx
        movq	%r10, %rbp
        addq	%r8, %rdx
        #   + a[0..1] << 128
        addq	%r8, %rbp
        movq	%r9, %rsi
        adcq	%r9, %rdx
        #   + a[0..2] << 64
        addq	%r8, %rsi
        movq	%r8, %rax
        adcq	%r9, %rbp
        adcq	%r10, %rdx
        #   a[0..2] << 32
        shlq	$32, %r8
        shldq	$32, %r9, %r10
        shldq	$32, %rax, %r9
        #   - (a[0..1] * 2) << 32 << 128
        subq	%r8, %rbp
        sbbq	%r9, %rdx
        subq	%r8, %rbp
        sbbq	%r9, %rdx
        #   - a[0..2] << 32 << 64
        subq	%r8, %rsi
        sbbq	%r9, %rbp
        sbbq	%r10, %rdx
        #   - (a[0..0] * 4) << 32 << 192
        movq	%r8, %r10
        shlq	$2, %r10
        subq	%r10, %rdx
        # a += (mu << 256) - (mu << 224) - (mu << 96) + (mu << 64) - mu
        # r8 tracks the net carry out of the top word: each carry subtracts
        # one (sbbq) and each borrow adds one back (adcq)
        #   a += mu << 256
        xorq	%r8, %r8
        addq	%rax, %r12
        adcq	%rsi, %r13
        adcq	%rbp, %r14
        adcq	%rdx, %r15
        sbbq	$0x00, %r8
        #   a += mu << 64
        addq	%rbp, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        adcq	$0x00, %r14
        adcq	$0x00, %r15
        sbbq	$0x00, %r8
        # mu <<= 32 (rcx takes the 32 bits shifted out of the top word)
        movq	%rdx, %rcx
        shldq	$32, %rbp, %rdx
        shldq	$32, %rsi, %rbp
        shldq	$32, %rax, %rsi
        shrq	$32, %rcx
        shlq	$32, %rax
        #   a -= (mu << 32) << 64
        subq	%rbp, %r11
        sbbq	%rdx, %r12
        sbbq	%rcx, %r13
        sbbq	$0x00, %r14
        sbbq	$0x00, %r15
        adcq	$0x00, %r8
        #   a -= (mu << 32) << 192
        subq	%rax, %r11
        sbbq	%rsi, %r12
        sbbq	%rbp, %r13
        sbbq	%rdx, %r14
        sbbq	%rcx, %r15
        adcq	$0x00, %r8
        movq	$0xffffffff00000000, %rax
        movq	$0xfffffffeffffffff, %rsi
        # mask m and sub from result if overflow
        #  m[0] = m[2] = -1 & mask = mask (r8 itself)
        #  m[1] = 0xffffffff00000000 & mask
        andq	%r8, %rax
        #  m[3] = 0xfffffffeffffffff & mask
        andq	%r8, %rsi
        subq	%r8, %r12
        sbbq	%rax, %r13
        sbbq	%r8, %r14
        sbbq	%rsi, %r15
        # Store the reduced result from r12..r15
        movq	%r12, (%rdi)
        movq	%r13, 8(%rdi)
        movq	%r14, 16(%rdi)
        movq	%r15, 24(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbp
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_mul_avx2_sm2_4,.-sp_256_mont_mul_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
 *
 * Uses mulx with the adcx/adox dual carry chains: the off-diagonal products
 * are summed once, then doubled on the CF chain while the diagonal squares
 * are added on the OF chain.
 * The modulus is hard-coded in the reduction, so m and mp are accepted but
 * never read: %rdx and %rcx are overwritten before any use.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime). Not read.
 * mp  Montgomery multiplier. Not read.
 *
 * Saves and restores %r12-%r15 and %rbx.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_sqr_avx2_sm2_4
.type	sp_256_mont_sqr_avx2_sm2_4,@function
.align	16
sp_256_mont_sqr_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_sqr_avx2_sm2_4
.p2align	4
_sp_256_mont_sqr_avx2_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # Clear CF and OF before the carry chains start
        xorq	%r8, %r8
        # Load A[0] into rdx, A[1] into rcx, A[2] into rbx, A[3] into r15
        movq	(%rsi), %rdx
        movq	8(%rsi), %rcx
        movq	16(%rsi), %rbx
        movq	24(%rsi), %r15
        # A[0] * A[1]
        mulxq	%rcx, %r9, %r10
        # A[0] * A[2]
        mulxq	%rbx, %r8, %r11
        adoxq	%r8, %r10
        # A[0] * A[3]
        mulxq	%r15, %r8, %r12
        movq	%rcx, %rdx
        adoxq	%r8, %r11
        # A[1] * A[2]
        mulxq	%rbx, %r8, %rax
        movq	%r15, %rdx
        adcxq	%r8, %r11
        # A[1] * A[3]
        mulxq	%rcx, %r8, %r13
        # r15 becomes zero via movq, which leaves both carry flags untouched
        movq	$0x00, %r15
        adoxq	%rax, %r12
        adcxq	%r8, %r12
        # A[2] * A[3]
        mulxq	%rbx, %r8, %r14
        adoxq	%r15, %r13
        adcxq	%r8, %r13
        adoxq	%r15, %r14
        adcxq	%r15, %r14
        # Double with Carry Flag
        # CF chain doubles r9..r14; OF chain adds the diagonal squares
        xorq	%r15, %r15
        # A[0] * A[0]
        movq	(%rsi), %rdx
        mulxq	%rdx, %r8, %rax
        adcxq	%r9, %r9
        adcxq	%r10, %r10
        adoxq	%rax, %r9
        # A[1] * A[1]
        movq	8(%rsi), %rdx
        mulxq	%rdx, %rcx, %rbx
        adcxq	%r11, %r11
        adoxq	%rcx, %r10
        # A[2] * A[2]
        movq	16(%rsi), %rdx
        mulxq	%rdx, %rax, %rcx
        adcxq	%r12, %r12
        adoxq	%rbx, %r11
        adcxq	%r13, %r13
        adoxq	%rax, %r12
        adcxq	%r14, %r14
        # A[3] * A[3]
        movq	24(%rsi), %rdx
        mulxq	%rdx, %rax, %rbx
        adoxq	%rcx, %r13
        adcxq	%r15, %r15
        adoxq	%rax, %r14
        adoxq	%rbx, %r15
        # The 512-bit square is now in r8 (low) .. r15 (high)
        # Start Reduction
        # mu = a[0..3] + a[0..2] << 64 - a[0..2] << 32 << 64
        #    + a[0..1] << 128 - (a[0..1] * 2) << 32 << 128
        #    + (a[0..0] * 2) << 192 - (a[0..0] * 4) << 32 << 192
        # mu is built in rax (word 0), rsi (word 1), rcx (word 2), rdx (word 3)
        # mu = a[0..3]
        movq	%r11, %rdx
        #   + (a[0..0] * 2) << 192
        addq	%r8, %rdx
        movq	%r10, %rcx
        addq	%r8, %rdx
        #   + a[0..1] << 128
        addq	%r8, %rcx
        movq	%r9, %rsi
        adcq	%r9, %rdx
        #   + a[0..2] << 64
        addq	%r8, %rsi
        movq	%r8, %rax
        adcq	%r9, %rcx
        adcq	%r10, %rdx
        #   a[0..2] << 32
        shlq	$32, %r8
        shldq	$32, %r9, %r10
        shldq	$32, %rax, %r9
        #   - (a[0..1] * 2) << 32 << 128
        subq	%r8, %rcx
        sbbq	%r9, %rdx
        subq	%r8, %rcx
        sbbq	%r9, %rdx
        #   - a[0..2] << 32 << 64
        subq	%r8, %rsi
        sbbq	%r9, %rcx
        sbbq	%r10, %rdx
        #   - (a[0..0] * 4) << 32 << 192
        movq	%r8, %r10
        shlq	$2, %r10
        subq	%r10, %rdx
        # a += (mu << 256) - (mu << 224) - (mu << 96) + (mu << 64) - mu
        # r8 tracks the net carry out of the top word: each carry subtracts
        # one (sbbq) and each borrow adds one back (adcq)
        #   a += mu << 256
        xorq	%r8, %r8
        addq	%rax, %r12
        adcq	%rsi, %r13
        adcq	%rcx, %r14
        adcq	%rdx, %r15
        sbbq	$0x00, %r8
        #   a += mu << 64
        addq	%rcx, %r11
        adcq	%rdx, %r12
        adcq	$0x00, %r13
        adcq	$0x00, %r14
        adcq	$0x00, %r15
        sbbq	$0x00, %r8
        # mu <<= 32 (rbx takes the 32 bits shifted out of the top word)
        movq	%rdx, %rbx
        shldq	$32, %rcx, %rdx
        shldq	$32, %rsi, %rcx
        shldq	$32, %rax, %rsi
        shrq	$32, %rbx
        shlq	$32, %rax
        #   a -= (mu << 32) << 64
        subq	%rcx, %r11
        sbbq	%rdx, %r12
        sbbq	%rbx, %r13
        sbbq	$0x00, %r14
        sbbq	$0x00, %r15
        adcq	$0x00, %r8
        #   a -= (mu << 32) << 192
        subq	%rax, %r11
        sbbq	%rsi, %r12
        sbbq	%rcx, %r13
        sbbq	%rdx, %r14
        sbbq	%rbx, %r15
        adcq	$0x00, %r8
        movq	$0xffffffff00000000, %rax
        movq	$0xfffffffeffffffff, %rsi
        # mask m and sub from result if overflow
        #  m[0] = m[2] = -1 & mask = mask (r8 itself)
        #  m[1] = 0xffffffff00000000 & mask
        andq	%r8, %rax
        #  m[3] = 0xfffffffeffffffff & mask
        andq	%r8, %rsi
        subq	%r8, %r12
        sbbq	%rax, %r13
        sbbq	%r8, %r14
        sbbq	%rsi, %r15
        # Store the reduced result from r12..r15
        movq	%r12, (%rdi)
        movq	%r13, 8(%rdi)
        movq	%r14, 16(%rdi)
        movq	%r15, 24(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_sqr_avx2_sm2_4,.-sp_256_mont_sqr_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Reduce the number back to 256 bits using Montgomery reduction.
 *
 * Word-by-word reduction of an eight limb value. Each of the four rounds
 * computes mu = a[i] * mp, adds m * mu at limb i so that limb i cancels, and
 * carries into the next higher limb. Two independent carry chains are kept:
 * adcx uses CF and adox uses OF.
 *
 * a   A single precision number to reduce in place. Limbs a[0..7] are read
 *     and the result is written to a[0..3].
 * m   The single precision number representing the modulus. It is only read
 *     in the multiply rounds; the final conditional subtraction uses hard
 *     coded limbs, so m is expected to hold that same value.
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * Registers: rdi = a, rsi = m, rax = mp, rdx = mu of the current round,
 * r10 = constant zero, r11 = carry out of a round, r8/r9/rcx = product
 * halves. The live limbs rotate through r12, r13, r14, r15 and rbx, which
 * are saved on entry and restored before returning.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_reduce_order_avx2_sm2_4
.type	sp_256_mont_reduce_order_avx2_sm2_4,@function
.align	16
sp_256_mont_reduce_order_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_reduce_order_avx2_sm2_4
.p2align	4
_sp_256_mont_reduce_order_avx2_sm2_4:
#endif /* __APPLE__ */
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbx
        # Keep mp in rax - rdx is the implicit multiplicand of mulx
        movq	%rdx, %rax
        movq	(%rdi), %r12
        movq	8(%rdi), %r13
        movq	16(%rdi), %r14
        movq	24(%rdi), %r15
        # xor also clears CF and OF, so both carry chains start empty
        xorq	%r11, %r11
        xorq	%r10, %r10
        # a[0-4] += m[0-3] * mu = m[0-3] * (a[0] * mp)
        movq	32(%rdi), %rbx
        #   mu = a[0] * mp
        movq	%r12, %rdx
        # Low half of the product is mu; the high half in rcx is discarded
        mulxq	%rax, %rdx, %rcx
        #   a[0] += m[0] * mu
        mulx	(%rsi), %r8, %r9
        adcxq	%r8, %r12
        #   a[1] += m[1] * mu
        mulx	8(%rsi), %r8, %rcx
        adoxq	%r9, %r13
        adcxq	%r8, %r13
        #   a[2] += m[2] * mu
        mulx	16(%rsi), %r8, %r9
        adoxq	%rcx, %r14
        adcxq	%r8, %r14
        #   a[3] += m[3] * mu
        mulx	24(%rsi), %r8, %rcx
        adoxq	%r9, %r15
        adcxq	%r8, %r15
        #   a[4] += carry
        adoxq	%rcx, %rbx
        adcxq	%r10, %rbx
        #   carry
        adoxq	%r10, %r11
        adcxq	%r10, %r11
        # a[1-5] += m[0-3] * mu = m[0-3] * (a[1] * mp)
        # r12 held a[0], which is no longer needed, and now takes a[5]
        movq	40(%rdi), %r12
        #   mu = a[1] * mp
        movq	%r13, %rdx
        mulxq	%rax, %rdx, %rcx
        #   a[1] += m[0] * mu
        mulx	(%rsi), %r8, %r9
        adcxq	%r8, %r13
        #   a[2] += m[1] * mu
        mulx	8(%rsi), %r8, %rcx
        adoxq	%r9, %r14
        adcxq	%r8, %r14
        #   a[3] += m[2] * mu
        mulx	16(%rsi), %r8, %r9
        adoxq	%rcx, %r15
        adcxq	%r8, %r15
        #   a[4] += m[3] * mu
        mulx	24(%rsi), %r8, %rcx
        adoxq	%r9, %rbx
        adcxq	%r8, %rbx
        #   a[5] += carry
        adoxq	%rcx, %r12
        adcxq	%r11, %r12
        # mov does not change the flags, so r11 can be reset mid-chain
        movq	%r10, %r11
        #   carry
        adoxq	%r10, %r11
        adcxq	%r10, %r11
        # a[2-6] += m[0-3] * mu = m[0-3] * (a[2] * mp)
        movq	48(%rdi), %r13
        #   mu = a[2] * mp
        movq	%r14, %rdx
        mulxq	%rax, %rdx, %rcx
        #   a[2] += m[0] * mu
        mulx	(%rsi), %r8, %r9
        adcxq	%r8, %r14
        #   a[3] += m[1] * mu
        mulx	8(%rsi), %r8, %rcx
        adoxq	%r9, %r15
        adcxq	%r8, %r15
        #   a[4] += m[2] * mu
        mulx	16(%rsi), %r8, %r9
        adoxq	%rcx, %rbx
        adcxq	%r8, %rbx
        #   a[5] += m[3] * mu
        mulx	24(%rsi), %r8, %rcx
        adoxq	%r9, %r12
        adcxq	%r8, %r12
        #   a[6] += carry
        adoxq	%rcx, %r13
        adcxq	%r11, %r13
        movq	%r10, %r11
        #   carry
        adoxq	%r10, %r11
        adcxq	%r10, %r11
        # a[3-7] += m[0-3] * mu = m[0-3] * (a[3] * mp)
        movq	56(%rdi), %r14
        #   mu = a[3] * mp
        movq	%r15, %rdx
        mulxq	%rax, %rdx, %rcx
        #   a[3] += m[0] * mu
        mulx	(%rsi), %r8, %r9
        adcxq	%r8, %r15
        #   a[4] += m[1] * mu
        mulx	8(%rsi), %r8, %rcx
        adoxq	%r9, %rbx
        adcxq	%r8, %rbx
        #   a[5] += m[2] * mu
        mulx	16(%rsi), %r8, %r9
        adoxq	%rcx, %r12
        adcxq	%r8, %r12
        #   a[6] += m[3] * mu
        mulx	24(%rsi), %r8, %rcx
        adoxq	%r9, %r13
        adcxq	%r8, %r13
        #   a[7] += carry
        adoxq	%rcx, %r14
        adcxq	%r11, %r14
        movq	%r10, %r11
        #   carry
        adoxq	%r10, %r11
        adcxq	%r10, %r11
        # Subtract mod if carry
        # The carry in r11 is negated into a mask that selects the modulus
        # limbs or zero, so the subtraction is done without a branch.
        negq	%r11
        # Hard coded limbs 0, 1 and 3 of the modulus; limb 2 is all ones,
        # so the mask in r11 stands in for it.
        movq	$0x53bbf40939d54123, %r8
        movq	$0x7203df6b21c6052b, %r9
        movq	$0xfffffffeffffffff, %rdx
        andq	%r11, %r8
        andq	%r11, %r9
        andq	%r11, %rdx
        subq	%r8, %rbx
        sbbq	%r9, %r12
        sbbq	%r11, %r13
        sbbq	%rdx, %r14
        # The reduced value lives in what were limbs 4 to 7
        movq	%rbx, (%rdi)
        movq	%r12, 8(%rdi)
        movq	%r13, 16(%rdi)
        movq	%r14, 24(%rdi)
        popq	%rbx
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_reduce_order_avx2_sm2_4,.-sp_256_mont_reduce_order_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Halve a modulo the prime. (r = a / 2 % m)
 *
 * When a is odd the modulus is added first so that the sum is even; the
 * addition is selected with a mask rather than a branch. The 257-bit sum is
 * then shifted right by one bit.
 *
 * r  Result of division by 2.
 * a  Number to divide.
 * m  Modulus (prime). Not read: the limbs of the modulus are hard coded.
 */
#ifndef __APPLE__
.text
.globl	sp_256_mont_div2_avx2_sm2_4
.type	sp_256_mont_div2_avx2_sm2_4,@function
.align	16
sp_256_mont_div2_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_mont_div2_avx2_sm2_4
.p2align	4
_sp_256_mont_div2_avx2_sm2_4:
#endif /* __APPLE__ */
        # Limbs of a: r8 = a[0], rax = a[1], r9 = a[2], r10 = a[3]
        movq	(%rsi), %r8
        movq	8(%rsi), %rax
        movq	16(%rsi), %r9
        movq	24(%rsi), %r10
        # r11 = all ones when a is odd, zero when a is even
        movl	%r8d, %r11d
        andl	$0x01, %r11d
        negq	%r11
        # Masked modulus: limbs 0 and 2 are all ones, so r11 itself is used
        movq	$0xffffffff00000000, %rcx
        movq	$0xfffffffeffffffff, %rdx
        andq	%r11, %rcx
        andq	%r11, %rdx
        addq	%r11, %r8
        adcq	%rcx, %rax
        adcq	%r11, %r9
        adcq	%rdx, %r10
        # CF is bit 256 of the sum: rotate it in from the top limb downwards
        rcrq	$0x01, %r10
        rcrq	$0x01, %r9
        rcrq	$0x01, %rax
        rcrq	$0x01, %r8
        movq	%r8, (%rdi)
        movq	%rax, 8(%rdi)
        movq	%r9, 16(%rdi)
        movq	%r10, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_mont_div2_avx2_sm2_4,.-sp_256_mont_div2_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#ifndef WC_NO_CACHE_RESISTANT
/* Copy table entry idx into r while loading every candidate entry.
 *
 * Entries 1 to 63 are all loaded; each one is masked with the result of
 * comparing its number against idx and ORed into the accumulators. Entry 0
 * is never loaded, so an idx that matches no scanned entry gives all zeros.
 *
 * r      Point to copy into. Bytes 0-31 and 64-95 are written.
 * table  Table - start of the entries to access. Entries are 64 bytes apart.
 * idx    Index of entry to retrieve. Compared as a 32-bit value.
 */
#ifndef __APPLE__
.text
.globl	sp_256_get_entry_64_sm2_4
.type	sp_256_get_entry_64_sm2_4,@function
.align	16
sp_256_get_entry_64_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_get_entry_64_sm2_4
.p2align	4
_sp_256_get_entry_64_sm2_4:
#endif /* __APPLE__ */
        # Skip entry 0 - the scan covers entries 1 to 63
        addq	$0x40, %rsi
        # xmm9 = idx in every lane, xmm11 = 1 in every lane
        movd	%edx, %xmm9
        movl	$0x01, %eax
        movd	%eax, %xmm11
        pshufd	$0x00, %xmm9, %xmm9
        pshufd	$0x00, %xmm11, %xmm11
        # xmm10 = number of the entry about to be loaded
        movdqa	%xmm11, %xmm10
        # Accumulators for the selected entry
        pxor	%xmm0, %xmm0
        pxor	%xmm1, %xmm1
        pxor	%xmm2, %xmm2
        pxor	%xmm3, %xmm3
        # rax = entries left to scan
        movl	$63, %eax
L_256_get_entry_64_sm2_4_scan:
        movdqu	(%rsi), %xmm4
        movdqu	16(%rsi), %xmm5
        movdqu	32(%rsi), %xmm6
        movdqu	48(%rsi), %xmm7
        # xmm8 = all ones when this entry is the wanted one, else zero
        movdqa	%xmm10, %xmm8
        pcmpeqd	%xmm9, %xmm8
        paddd	%xmm11, %xmm10
        pand	%xmm8, %xmm4
        por	%xmm4, %xmm0
        pand	%xmm8, %xmm5
        por	%xmm5, %xmm1
        pand	%xmm8, %xmm6
        por	%xmm6, %xmm2
        pand	%xmm8, %xmm7
        por	%xmm7, %xmm3
        addq	$0x40, %rsi
        decq	%rax
        jnz	L_256_get_entry_64_sm2_4_scan
        # First half of the entry goes to offset 0, second half to offset 64
        movdqu	%xmm0, (%rdi)
        movdqu	%xmm1, 16(%rdi)
        movdqu	%xmm2, 64(%rdi)
        movdqu	%xmm3, 80(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_get_entry_64_sm2_4,.-sp_256_get_entry_64_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Touch each possible entry that could be being copied.
 *
 * AVX2 variant: every scanned entry is loaded as two 32-byte halves, masked
 * with the result of comparing its number against idx and ORed into the
 * accumulators. Entry 0 is skipped and entries 1 to 63 are scanned, the same
 * range as the SSE routine, so nothing beyond a 64 entry table is read.
 * An idx that matches no scanned entry gives all zeros.
 *
 * r      Point to copy into. Bytes 0-31 and 64-95 are written.
 * table  Table - start of the entries to access. Entries are 64 bytes apart.
 * idx    Index of entry to retrieve. Compared as a 32-bit value.
 */
#ifndef __APPLE__
.text
.globl	sp_256_get_entry_64_avx2_sm2_4
.type	sp_256_get_entry_64_avx2_sm2_4,@function
.align	16
sp_256_get_entry_64_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_get_entry_64_avx2_sm2_4
.p2align	4
_sp_256_get_entry_64_avx2_sm2_4:
#endif /* __APPLE__ */
        movq	$0x01, %rax
        movd	%edx, %xmm5
        # Skip entry 0 - scanning starts at entry 1
        addq	$0x40, %rsi
        movd	%eax, %xmm7
        # Entries 1 to 63 remain to be scanned
        movq	$63, %rax
        # Index vector of zeros makes vpermd broadcast lane 0
        vpxor	%ymm6, %ymm6, %ymm6
        # ymm5 = idx in every lane, ymm7 = 1 in every lane
        vpermd	%ymm5, %ymm6, %ymm5
        vpermd	%ymm7, %ymm6, %ymm7
        # ymm0 and ymm1 accumulate the selected entry
        vpxor	%ymm0, %ymm0, %ymm0
        vpxor	%ymm1, %ymm1, %ymm1
        # ymm6 = number of the entry about to be loaded
        vmovdqa	%ymm7, %ymm6
L_256_get_entry_64_avx2_sm2_4_start:
        # ymm4 = all ones when this entry is the wanted one, else zero
        vpcmpeqd	%ymm5, %ymm6, %ymm4
        vpaddd	%ymm7, %ymm6, %ymm6
        vmovupd	(%rsi), %ymm2
        vmovupd	32(%rsi), %ymm3
        addq	$0x40, %rsi
        vpand	%ymm4, %ymm2, %ymm2
        vpand	%ymm4, %ymm3, %ymm3
        vpor	%ymm2, %ymm0, %ymm0
        vpor	%ymm3, %ymm1, %ymm1
        decq	%rax
        jnz	L_256_get_entry_64_avx2_sm2_4_start
        vmovupd	%ymm0, (%rdi)
        vmovupd	%ymm1, 64(%rdi)
        # Clear the upper ymm halves so later legacy SSE code is not slowed
        vzeroupper
        repz retq
#ifndef __APPLE__
.size	sp_256_get_entry_64_avx2_sm2_4,.-sp_256_get_entry_64_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* !WC_NO_CACHE_RESISTANT */
#ifndef WC_NO_CACHE_RESISTANT
/* Copy table entry idx into r while loading every candidate entry.
 *
 * Entries 1 to 64 are all loaded; each one is masked with the result of
 * comparing its number against idx and ORed into the accumulators. Entry 0
 * is never loaded, so an idx that matches no scanned entry gives all zeros.
 *
 * r      Point to copy into. Bytes 0-31 and 64-95 are written.
 * table  Table - start of the entries to access. Entries are 64 bytes apart.
 * idx    Index of entry to retrieve. Compared as a 32-bit value.
 */
#ifndef __APPLE__
.text
.globl	sp_256_get_entry_65_sm2_4
.type	sp_256_get_entry_65_sm2_4,@function
.align	16
sp_256_get_entry_65_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_get_entry_65_sm2_4
.p2align	4
_sp_256_get_entry_65_sm2_4:
#endif /* __APPLE__ */
        # Skip entry 0 - the scan covers entries 1 to 64
        addq	$0x40, %rsi
        # xmm9 = idx in every lane, xmm11 = 1 in every lane
        movd	%edx, %xmm9
        movl	$0x01, %eax
        movd	%eax, %xmm11
        pshufd	$0x00, %xmm9, %xmm9
        pshufd	$0x00, %xmm11, %xmm11
        # xmm10 = number of the entry about to be loaded
        movdqa	%xmm11, %xmm10
        # Accumulators for the selected entry
        pxor	%xmm0, %xmm0
        pxor	%xmm1, %xmm1
        pxor	%xmm2, %xmm2
        pxor	%xmm3, %xmm3
        # rax = entries left to scan
        movl	$0x40, %eax
L_256_get_entry_65_sm2_4_scan:
        movdqu	(%rsi), %xmm4
        movdqu	16(%rsi), %xmm5
        movdqu	32(%rsi), %xmm6
        movdqu	48(%rsi), %xmm7
        # xmm8 = all ones when this entry is the wanted one, else zero
        movdqa	%xmm10, %xmm8
        pcmpeqd	%xmm9, %xmm8
        paddd	%xmm11, %xmm10
        pand	%xmm8, %xmm4
        por	%xmm4, %xmm0
        pand	%xmm8, %xmm5
        por	%xmm5, %xmm1
        pand	%xmm8, %xmm6
        por	%xmm6, %xmm2
        pand	%xmm8, %xmm7
        por	%xmm7, %xmm3
        addq	$0x40, %rsi
        decq	%rax
        jnz	L_256_get_entry_65_sm2_4_scan
        # First half of the entry goes to offset 0, second half to offset 64
        movdqu	%xmm0, (%rdi)
        movdqu	%xmm1, 16(%rdi)
        movdqu	%xmm2, 64(%rdi)
        movdqu	%xmm3, 80(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_get_entry_65_sm2_4,.-sp_256_get_entry_65_sm2_4
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
/* Touch each possible entry that could be being copied.
 *
 * AVX2 variant: every scanned entry is loaded as two 32-byte halves, masked
 * with the result of comparing its number against idx and ORed into the
 * accumulators. Entry 0 is skipped and entries 1 to 64 are scanned, the same
 * range as the SSE routine, so nothing beyond a 65 entry table is read.
 * An idx that matches no scanned entry gives all zeros.
 *
 * r      Point to copy into. Bytes 0-31 and 64-95 are written.
 * table  Table - start of the entries to access. Entries are 64 bytes apart.
 * idx    Index of entry to retrieve. Compared as a 32-bit value.
 */
#ifndef __APPLE__
.text
.globl	sp_256_get_entry_65_avx2_sm2_4
.type	sp_256_get_entry_65_avx2_sm2_4,@function
.align	16
sp_256_get_entry_65_avx2_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_get_entry_65_avx2_sm2_4
.p2align	4
_sp_256_get_entry_65_avx2_sm2_4:
#endif /* __APPLE__ */
        movq	$0x01, %rax
        movd	%edx, %xmm5
        # Skip entry 0 - scanning starts at entry 1
        addq	$0x40, %rsi
        movd	%eax, %xmm7
        # Entries 1 to 64 remain to be scanned
        movq	$0x40, %rax
        # Index vector of zeros makes vpermd broadcast lane 0
        vpxor	%ymm6, %ymm6, %ymm6
        # ymm5 = idx in every lane, ymm7 = 1 in every lane
        vpermd	%ymm5, %ymm6, %ymm5
        vpermd	%ymm7, %ymm6, %ymm7
        # ymm0 and ymm1 accumulate the selected entry
        vpxor	%ymm0, %ymm0, %ymm0
        vpxor	%ymm1, %ymm1, %ymm1
        # ymm6 = number of the entry about to be loaded
        vmovdqa	%ymm7, %ymm6
L_256_get_entry_65_avx2_sm2_4_start:
        # ymm4 = all ones when this entry is the wanted one, else zero
        vpcmpeqd	%ymm5, %ymm6, %ymm4
        vpaddd	%ymm7, %ymm6, %ymm6
        vmovupd	(%rsi), %ymm2
        vmovupd	32(%rsi), %ymm3
        addq	$0x40, %rsi
        vpand	%ymm4, %ymm2, %ymm2
        vpand	%ymm4, %ymm3, %ymm3
        vpor	%ymm2, %ymm0, %ymm0
        vpor	%ymm3, %ymm1, %ymm1
        decq	%rax
        jnz	L_256_get_entry_65_avx2_sm2_4_start
        vmovupd	%ymm0, (%rdi)
        vmovupd	%ymm1, 64(%rdi)
        # Clear the upper ymm halves so later legacy SSE code is not slowed
        vzeroupper
        repz retq
#ifndef __APPLE__
.size	sp_256_get_entry_65_avx2_sm2_4,.-sp_256_get_entry_65_avx2_sm2_4
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* !WC_NO_CACHE_RESISTANT */
/* Increment a four limb number in place. (a = a + 1)
 *
 * The carry is propagated through all four limbs; a carry out of the top
 * limb is dropped.
 *
 * a  A single precision integer.
 */
#ifndef __APPLE__
.text
.globl	sp_256_add_one_sm2_4
.type	sp_256_add_one_sm2_4,@function
.align	16
sp_256_add_one_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_add_one_sm2_4
.p2align	4
_sp_256_add_one_sm2_4:
#endif /* __APPLE__ */
        # Work on a register copy of the four limbs
        movq	(%rdi), %rcx
        movq	8(%rdi), %rdx
        movq	16(%rdi), %rsi
        movq	24(%rdi), %r8
        addq	$0x01, %rcx
        adcq	$0x00, %rdx
        adcq	$0x00, %rsi
        adcq	$0x00, %r8
        movq	%rcx, (%rdi)
        movq	%rdx, 8(%rdi)
        movq	%rsi, 16(%rdi)
        movq	%r8, 24(%rdi)
        repz retq
#ifndef __APPLE__
.size	sp_256_add_one_sm2_4,.-sp_256_add_one_sm2_4
#endif /* __APPLE__ */
/* Convert a big endian byte array into little endian 64-bit limbs in r.
 * Uses the bswap instruction.
 *
 * The array is consumed from its end: whole 64-byte and then 8-byte groups
 * are byte reversed into successive limbs, any remaining leading bytes are
 * packed into one more limb, and limbs up to r + 32 bytes are zeroed.
 *
 * r  A single precision integer.
 * size  Maximum number of bytes to convert. Not read by this routine.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_from_bin_sm2_bswap
.type	sp_256_from_bin_sm2_bswap,@function
.align	16
sp_256_from_bin_sm2_bswap:
#else
.section	__TEXT,__text
.globl	_sp_256_from_bin_sm2_bswap
.p2align	4
_sp_256_from_bin_sm2_bswap:
#endif /* __APPLE__ */
        # r9 = one past the last unread byte, r10 = one past the last limb
        leaq	(%rdx,%rcx), %r9
        leaq	32(%rdi), %r10
L_256_from_bin_sm2_bswap_64_check:
        cmpq	$63, %rcx
        jle	L_256_from_bin_sm2_bswap_8_check
        # Byte reverse the last 64 unread bytes into eight limbs
        movq	-8(%r9), %rax
        movq	-16(%r9), %r8
        bswapq	%rax
        bswapq	%r8
        movq	%rax, (%rdi)
        movq	%r8, 8(%rdi)
        movq	-24(%r9), %rax
        movq	-32(%r9), %r8
        bswapq	%rax
        bswapq	%r8
        movq	%rax, 16(%rdi)
        movq	%r8, 24(%rdi)
        movq	-40(%r9), %rax
        movq	-48(%r9), %r8
        bswapq	%rax
        bswapq	%r8
        movq	%rax, 32(%rdi)
        movq	%r8, 40(%rdi)
        movq	-56(%r9), %rax
        movq	-64(%r9), %r8
        bswapq	%rax
        bswapq	%r8
        movq	%rax, 48(%rdi)
        movq	%r8, 56(%rdi)
        subq	$0x40, %r9
        addq	$0x40, %rdi
        subq	$0x40, %rcx
        jmp	L_256_from_bin_sm2_bswap_64_check
L_256_from_bin_sm2_bswap_8_check:
        cmpq	$7, %rcx
        jle	L_256_from_bin_sm2_bswap_hi_check
        # Byte reverse the last 8 unread bytes into one limb
        movq	-8(%r9), %rax
        bswapq	%rax
        movq	%rax, (%rdi)
        subq	$8, %r9
        addq	$8, %rdi
        subq	$8, %rcx
        jmp	L_256_from_bin_sm2_bswap_8_check
L_256_from_bin_sm2_bswap_hi_check:
        testq	%rcx, %rcx
        jz	L_256_from_bin_sm2_bswap_zero_check
        # Pack the leading bytes, read forwards from the start of a
        xorl	%r8d, %r8d
L_256_from_bin_sm2_bswap_hi_loop:
        shlq	$8, %r8
        movzbl	(%rdx), %eax
        incq	%rdx
        orq	%rax, %r8
        decq	%rcx
        jg	L_256_from_bin_sm2_bswap_hi_loop
        movq	%r8, (%rdi)
        addq	$8, %rdi
L_256_from_bin_sm2_bswap_zero_check:
        # Zero any limbs that were not written
        cmpq	%r10, %rdi
        jge	L_256_from_bin_sm2_bswap_done
        movq	$0x00, (%rdi)
        addq	$8, %rdi
        jmp	L_256_from_bin_sm2_bswap_zero_check
L_256_from_bin_sm2_bswap_done:
        repz retq
#ifndef __APPLE__
.size	sp_256_from_bin_sm2_bswap,.-sp_256_from_bin_sm2_bswap
#endif /* __APPLE__ */
#ifndef NO_MOVBE_SUPPORT
/* Convert a big endian byte array into little endian 64-bit limbs in r.
 * Uses the movbe instruction which is an optional instruction.
 *
 * The array is consumed from its end: whole 64-byte and then 8-byte groups
 * are loaded byte reversed into successive limbs, any remaining leading
 * bytes are packed into one more limb, and limbs up to r + 32 bytes are
 * zeroed.
 *
 * r  A single precision integer.
 * size  Maximum number of bytes to convert. Not read by this routine.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
#ifndef __APPLE__
.text
.globl	sp_256_from_bin_sm2_movbe
.type	sp_256_from_bin_sm2_movbe,@function
.align	16
sp_256_from_bin_sm2_movbe:
#else
.section	__TEXT,__text
.globl	_sp_256_from_bin_sm2_movbe
.p2align	4
_sp_256_from_bin_sm2_movbe:
#endif /* __APPLE__ */
        # r9 = one past the last unread byte, r10 = one past the last limb
        leaq	(%rdx,%rcx), %r9
        leaq	32(%rdi), %r10
        jmp	L_256_from_bin_sm2_movbe_64_end
L_256_from_bin_sm2_movbe_64_start:
        # Load the last 64 unread bytes reversed into eight limbs
        movbeq	-8(%r9), %rax
        movbeq	-16(%r9), %r8
        movq	%rax, (%rdi)
        movq	%r8, 8(%rdi)
        movbeq	-24(%r9), %rax
        movbeq	-32(%r9), %r8
        movq	%rax, 16(%rdi)
        movq	%r8, 24(%rdi)
        movbeq	-40(%r9), %rax
        movbeq	-48(%r9), %r8
        movq	%rax, 32(%rdi)
        movq	%r8, 40(%rdi)
        movbeq	-56(%r9), %rax
        movbeq	-64(%r9), %r8
        movq	%rax, 48(%rdi)
        movq	%r8, 56(%rdi)
        subq	$0x40, %r9
        addq	$0x40, %rdi
        subq	$0x40, %rcx
L_256_from_bin_sm2_movbe_64_end:
        cmpq	$63, %rcx
        jg	L_256_from_bin_sm2_movbe_64_start
        jmp	L_256_from_bin_sm2_movbe_8_end
L_256_from_bin_sm2_movbe_8_start:
        # Load the last 8 unread bytes reversed into one limb
        movbeq	-8(%r9), %rax
        movq	%rax, (%rdi)
        subq	$8, %r9
        addq	$8, %rdi
        subq	$8, %rcx
L_256_from_bin_sm2_movbe_8_end:
        cmpq	$7, %rcx
        jg	L_256_from_bin_sm2_movbe_8_start
        testq	%rcx, %rcx
        je	L_256_from_bin_sm2_movbe_hi_end
        # Pack the leading bytes, read forwards from the start of a
        xorl	%r8d, %r8d
L_256_from_bin_sm2_movbe_hi_start:
        shlq	$8, %r8
        movzbl	(%rdx), %eax
        incq	%rdx
        orq	%rax, %r8
        decq	%rcx
        jg	L_256_from_bin_sm2_movbe_hi_start
        movq	%r8, (%rdi)
        addq	$8, %rdi
L_256_from_bin_sm2_movbe_hi_end:
        # Zero any limbs that were not written
        cmpq	%r10, %rdi
        jge	L_256_from_bin_sm2_movbe_zero_end
L_256_from_bin_sm2_movbe_zero_start:
        movq	$0x00, (%rdi)
        addq	$8, %rdi
        cmpq	%r10, %rdi
        jl	L_256_from_bin_sm2_movbe_zero_start
L_256_from_bin_sm2_movbe_zero_end:
        repz retq
#ifndef __APPLE__
.size	sp_256_from_bin_sm2_movbe,.-sp_256_from_bin_sm2_movbe
#endif /* __APPLE__ */
#endif /* !NO_MOVBE_SUPPORT */
/* Serialise the four limbs of r as 32 big endian bytes.
 * Fixed length number of bytes written: 32
 * Uses the bswap instruction.
 *
 * The most significant limb is written first, each limb byte reversed.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
#ifndef __APPLE__
.text
.globl	sp_256_to_bin_bswap_sm2_4
.type	sp_256_to_bin_bswap_sm2_4,@function
.align	16
sp_256_to_bin_bswap_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_to_bin_bswap_sm2_4
.p2align	4
_sp_256_to_bin_bswap_sm2_4:
#endif /* __APPLE__ */
        # Upper pair of limbs becomes bytes 0-15
        movq	16(%rdi), %rcx
        movq	24(%rdi), %r8
        bswapq	%rcx
        bswapq	%r8
        movq	%r8, (%rsi)
        movq	%rcx, 8(%rsi)
        # Lower pair of limbs becomes bytes 16-31
        movq	(%rdi), %rcx
        movq	8(%rdi), %r8
        bswapq	%rcx
        bswapq	%r8
        movq	%r8, 16(%rsi)
        movq	%rcx, 24(%rsi)
        repz retq
#ifndef __APPLE__
.size	sp_256_to_bin_bswap_sm2_4,.-sp_256_to_bin_bswap_sm2_4
#endif /* __APPLE__ */
#ifndef NO_MOVBE_SUPPORT
/* Serialise the four limbs of r as 32 big endian bytes.
 * Fixed length number of bytes written: 32
 * Uses the movbe instruction which is optional.
 *
 * The most significant limb is written first; the byte reversal is done by
 * the movbe stores.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
#ifndef __APPLE__
.text
.globl	sp_256_to_bin_movbe_sm2_4
.type	sp_256_to_bin_movbe_sm2_4,@function
.align	16
sp_256_to_bin_movbe_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_to_bin_movbe_sm2_4
.p2align	4
_sp_256_to_bin_movbe_sm2_4:
#endif /* __APPLE__ */
        # Upper pair of limbs becomes bytes 0-15
        movq	24(%rdi), %rcx
        movq	16(%rdi), %r8
        movbeq	%rcx, (%rsi)
        movbeq	%r8, 8(%rsi)
        # Lower pair of limbs becomes bytes 16-31
        movq	8(%rdi), %rcx
        movq	(%rdi), %r8
        movbeq	%rcx, 16(%rsi)
        movbeq	%r8, 24(%rsi)
        repz retq
#ifndef __APPLE__
.size	sp_256_to_bin_movbe_sm2_4,.-sp_256_to_bin_movbe_sm2_4
#endif /* __APPLE__ */
#endif /* !NO_MOVBE_SUPPORT */
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Computes r = a + (b & m) over four limbs without branching and returns
 * the carry out of the top limb (0 or 1) in rax. Only caller-saved
 * registers are used, so nothing is pushed on the stack. All of b and a are
 * read before r is written.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
#ifndef __APPLE__
.text
.globl	sp_256_cond_add_sm2_4
.type	sp_256_cond_add_sm2_4,@function
.align	16
sp_256_cond_add_sm2_4:
#else
.section	__TEXT,__text
.globl	_sp_256_cond_add_sm2_4
.p2align	4
_sp_256_cond_add_sm2_4:
#endif /* __APPLE__ */
        # Masked copy of b: limbs are kept when m is -1, cleared when m is 0
        movq	(%rdx), %r8
        movq	8(%rdx), %r9
        movq	16(%rdx), %r10
        movq	24(%rdx), %r11
        andq	%rcx, %r8
        andq	%rcx, %r9
        andq	%rcx, %r10
        andq	%rcx, %r11
        # Clear the return value before the add chain - xor changes the flags
        xorl	%eax, %eax
        addq	(%rsi), %r8
        adcq	8(%rsi), %r9
        adcq	16(%rsi), %r10
        adcq	24(%rsi), %r11
        # mov leaves the flags alone, so the carry survives the stores
        movq	%r8, (%rdi)
        movq	%r9, 8(%rdi)
        movq	%r10, 16(%rdi)
        movq	%r11, 24(%rdi)
        adcq	$0x00, %rax
        repz retq
#ifndef __APPLE__
.size	sp_256_cond_add_sm2_4,.-sp_256_cond_add_sm2_4
#endif /* __APPLE__ */
#endif /* WOLFSSL_SP_SM2 */
#endif /* WOLFSSL_SP_X86_64_ASM */

#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif
